In [1]:
import pandas as pd
import numpy as np
from collections import Counter
from matplotlib import pyplot
from numpy import where
from itertools import chain

# Read the training and development splits from CSV files in the working
# directory; both frames are consumed by the cells below.
train, dev = (pd.read_csv(csv_name) for csv_name in ('train.csv', 'dev.csv'))

In [2]:
# stop words
# Build a custom stop-word list: the words that are among the most frequent
# whitespace-delimited tokens of BOTH the label==1 and the label==0 reviews.
# NOTE(review): tokens keep their original case and any attached punctuation;
# if the downstream vectorizer lowercases/strips punctuation, some of these
# entries may never match a vocabulary token -- confirm this is intended.

TOP_N_WORDS = 25  # how many of the most frequent tokens per class to compare


def _tokenize_reviews(reviews):
    """Split every non-missing review into whitespace-delimited tokens.

    Parameters
    ----------
    reviews : pandas.Series
        Series of review strings; missing entries (NaN) are skipped.

    Returns
    -------
    list of list of str
        One token list per non-missing review, in the original order.
    """
    # dropna() instead of range(count()) + iloc[i]: count() excludes NaN while
    # iloc is positional, so a missing review made the old loop call .split()
    # on NaN and fail.
    return [text.split() for text in reviews.dropna()]


positive = train[train['label']==1]
true_word_list = _tokenize_reviews(positive['review'])
list1 = list(chain.from_iterable(true_word_list))
true_mc = Counter(list1).most_common()  # [(word, count), ...] by descending count

negative = train[train['label']==0]
false_word_list = _tokenize_reviews(negative['review'])
list2 = list(chain.from_iterable(false_word_list))
false_mc = Counter(list2).most_common()

# Tabular views of the top tokens per class, kept for inspection.
df_true = pd.DataFrame(true_mc)[0:TOP_N_WORDS]
df_false = pd.DataFrame(false_mc)[0:TOP_N_WORDS]

# Intersect straight from the (word, count) lists so an empty class does not
# raise on column indexing, and sort so the result is identical on every run.
top_true_words = {word for word, _ in true_mc[:TOP_N_WORDS]}
top_false_words = {word for word, _ in false_mc[:TOP_N_WORDS]}
common_words = sorted(top_true_words & top_false_words)

In [3]:
from sklearn.feature_extraction.text import TfidfVectorizer

# Separate the raw review text (model input) from the labels (target) for
# both splits.
X_train, y_train = train['review'], train['label']
X_dev, y_dev = dev['review'], dev['label']

# Unigram + bigram TF-IDF, capped at 10000 features, with the words shared by
# both classes' top tokens passed as stop words.
tfidf = TfidfVectorizer(
    stop_words=common_words,
    ngram_range=(1, 2),
    max_features=10000,
)

# Fit the vocabulary/IDF on the training split only, then reuse the fitted
# vectorizer to transform the dev split.
X_train_tfidf = tfidf.fit_transform(X_train)
X_dev_tfidf = tfidf.transform(X_dev)

In [None]:
# weighted by n_samples / (n_classes * np.bincount(y))
from sklearn.svm import SVC
from sklearn.metrics import roc_auc_score
from sklearn.metrics import average_precision_score

# Fit an SVC with 'balanced' class weights on the TF-IDF training features.
model = SVC(gamma='scale',class_weight='balanced')
model.fit(X_train_tfidf,y_train)
y_pred = model.predict(X_dev_tfidf)  # hard class labels, kept for inspection

# ROC AUC and average precision are ranking metrics: they need a continuous
# score per sample. Feeding them hard predictions collapses each curve to a
# single operating point, so use the signed distance to the decision boundary.
y_score = model.decision_function(X_dev_tfidf)
rocauc = roc_auc_score(y_dev,y_score)
ap = average_precision_score(y_dev,y_score)
print(rocauc,ap)

In [None]:
# weighted by 10 for fake review
# Class 1 gets weight 10; classes not listed in the dict keep weight 1.
# (Untried alternative: kernel='linear'.)
model = SVC(gamma='scale',class_weight={1:10})
model.fit(X_train_tfidf,y_train)
y_pred = model.predict(X_dev_tfidf)  # hard class labels, kept for inspection

# ROC AUC and average precision are ranking metrics: score them on the
# continuous decision values rather than on thresholded predictions, which
# would reduce each curve to a single operating point.
y_score = model.decision_function(X_dev_tfidf)
rocauc=roc_auc_score(y_dev,y_score)
ap=average_precision_score(y_dev,y_score)
print(rocauc,ap)

In [None]:
# tuning
# Sweep the class-weight setting and record dev-set ROC AUC and average
# precision for each option. `rocaucs[i]` and `aps[i]` correspond to
# `weights[i]`.
weights = ['balanced',5,10,20,50]
rocaucs = []
aps = []
for w in weights:
    # 'balanced' is passed through as-is; a number is the weight of class 1.
    class_weight = w if w == 'balanced' else {1:w}
    model = SVC(gamma='scale',class_weight=class_weight)
    model.fit(X_train_tfidf,y_train)
    y_pred = model.predict(X_dev_tfidf)  # hard class labels, kept for inspection

    # Ranking metrics must be computed from continuous scores; hard
    # predictions would collapse each curve to a single operating point.
    y_score = model.decision_function(X_dev_tfidf)
    rocauc=roc_auc_score(y_dev,y_score)
    ap=average_precision_score(y_dev,y_score)
    rocaucs.append(rocauc)
    aps.append(ap)

In [None]:
# plot
# Dev-set metrics against the numeric class weights. The first entry of
# `weights` is the string 'balanced', which has no position on a numeric axis,
# so it is skipped via the [1:] slices.
# The module is imported as `pyplot` (there is no `plt` alias in this
# notebook), and the axis-label functions are `xlabel` / `ylabel`.
for metric_values, y_axis_label, plot_title in (
    (rocaucs, 'ROC_AUC on dev', 'ROC_AUC'),
    (aps, 'AP on dev', 'AP'),
):
    pyplot.plot(weights[1:], metric_values[1:])
    pyplot.xlabel('weighted option')
    pyplot.ylabel(y_axis_label)
    pyplot.title(plot_title)
    pyplot.show()