In [113]:
%matplotlib inline
import time
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import TruncatedSVD
from sklearn.metrics import f1_score
from sklearn.model_selection import train_test_split
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import Normalizer
from sklearn.neighbors import KNeighborsClassifier

In [84]:
df = pd.read_csv("7Truth7LiesDataset.csv",sep = ',', quotechar="'",escapechar="\\")
df = df.sample(frac=1,random_state=0).reset_index(drop=True).copy()
df = df.rename(columns={"class":"clase"}) # class es palabra reservada
df['clase'] = np.where(df['clase'] == 'truth', 1, 0)

In [85]:
trainset_texto, testset_texto, trainset_clase, testset_clase =  train_test_split(df.text,df.clase,stratify=df.clase, test_size=0.3, random_state = 123)

In [106]:
vectorizer = TfidfVectorizer(max_df=0.8, max_features=10000,
                             min_df=7, #stop_words='english',
                             use_idf=True)

# Build the tfidf vectorizer from the training data ("fit"), and apply it 
# ("transform").
X_train_tfidf = vectorizer.fit_transform(trainset_texto)

print("  Actual number of tfidf features: %d" % X_train_tfidf.get_shape()[1])

print("\nPerforming dimensionality reduction using LSA")
t0 = time.time()

  Actual number of tfidf features: 769

Performing dimensionality reduction using LSA


In [117]:
svd = TruncatedSVD(100, n_iter=500, random_state=123)
lsa = make_pipeline(svd, Normalizer(copy=False))

# Run SVD on the training data, then project the training data.
X_train_lsa = lsa.fit_transform(X_train_tfidf)

print("  done in %.3fsec" % (time.time() - t0))

explained_variance = svd.explained_variance_ratio_.sum()
print("  Explained variance of the SVD step: {}%".format(int(explained_variance * 100)))

X_test_tfidf = vectorizer.transform(testset_texto)
X_test_lsa = lsa.transform(X_test_tfidf)

  done in 2399.880sec
  Explained variance of the SVD step: 47%


In [118]:
print("\nClassifying tfidf vectors...")

# Time this step.
t0 = time.time()

# Build a k-NN classifier. Use k = 5 (majority wins), the cosine distance, 
# and brute-force calculation of distances.
knn_tfidf = KNeighborsClassifier(n_neighbors=5, algorithm='brute', metric='cosine')
knn_tfidf.fit(X_train_tfidf, y_train)

# Classify the test vectors.
p = knn_tfidf.predict(X_test_tfidf)

# Measure accuracy
numRight = 0;
for i in range(len(p)):
    if p[i] == y_test[i]:
        numRight += 1

print('F-Score is {}'.format(f1_score(y_test, p, average="macro")))

# Calculate the elapsed time (in seconds)
elapsed = (time.time() - t0)
print("  done in %.3fsec" % elapsed)


Classifying tfidf vectors...
F-Score is 0.5657342861625183
  done in 0.280sec


In [119]:
# Build a k-NN classifier. Use k = 5 (majority wins), the cosine distance, 
# and brute-force calculation of distances.
y_train = trainset_clase.tolist()
y_test = testset_clase.tolist()

print("\nClassifying LSA vectors...")

# Time this step.
t0 = time.time()

# Build a k-NN classifier. Use k = 5 (majority wins), the cosine distance, 
# and brute-force calculation of distances.
knn_lsa = KNeighborsClassifier(n_neighbors=5, algorithm='brute', metric='cosine')
knn_lsa.fit(X_train_lsa, y_train)

# Classify the test vectors.
p = knn_lsa.predict(X_test_lsa)

print('F-Score is {}'.format(print(f1_score(y_test, p, average="macro"))))

# Calculate the elapsed time (in seconds)
elapsed = (time.time() - t0)    
print("    done in %.3fsec" % elapsed)


Classifying LSA vectors...
0.5620299841778993
F-Score is None
    done in 0.270sec
