In [1]:
import sklearn
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.svm import SVC
import os

In [2]:
trainpath = os.path.join("data", "train_conll_hinglish.csv")

In [3]:
train = pd.read_csv(trainpath, sep='\\t', names=["ID","SENTENCE","LABEL"])

train.head()

  """Entry point for launching an IPython kernel.


Unnamed: 0,ID,SENTENCE,LABEL
0,3,@ AdilNisarButt pakistan ka ghra tauq he Pakis...,negative
1,41,Madarchod mulle ye mathura me Nahi dikha tha j...,negative
2,48,@ narendramodi Manya Pradhan Mantri mahoday Sh...,positive
3,64,@ Atheist_ Krishna Jcb full trend me chal rahi aa,positive
4,66,@ AbhisharSharma_@ RavishKumarBlog Loksabha me...,positive


In [4]:
test = pd.read_csv("sample_test.csv", names=["ID", "SENTENCE", "LABEL"])
test.head()

Unnamed: 0,ID,SENTENCE,LABEL
0,1,great books read very well,positive
1,2,boo hoo mar gaya bechara,negative


In [5]:
train.columns


Index(['ID', 'SENTENCE', 'LABEL'], dtype='object')

In [6]:
test.columns

Index(['ID', 'SENTENCE', 'LABEL'], dtype='object')

In [7]:
char_vectorizer = TfidfVectorizer(
    sublinear_tf=True,
    strip_accents='unicode',
    analyzer='char',
    ngram_range=(1, 4),
    dtype=np.float32,
    max_features=5000
)


In [8]:
char_vectorizer.fit(train['SENTENCE'])
train_char_features = char_vectorizer.transform(train['SENTENCE'])

In [9]:
char_vectorizer.fit(test['SENTENCE'])
test_char_features = char_vectorizer.transform(test['SENTENCE'])

In [10]:
train_char_features.shape

(15131, 5000)

In [11]:
from sklearn.model_selection import train_test_split 
from sklearn.metrics import accuracy_score, precision_recall_fscore_support
model_NB = MultinomialNB()

X_train_tfidf, X_test_tfidf, y_train_tfidf, y_test_tfidf = train_test_split(train_char_features, train.LABEL, train_size=0.75)
model_NB.fit(X_train_tfidf, y_train_tfidf)
predictions_tfidf = model_NB.predict(X_test_tfidf)
accuracy_tfidf = accuracy_score(y_test_tfidf, predictions_tfidf)
# accuracy_tfidf
precision, recall, f1, _ = precision_recall_fscore_support(y_test_tfidf, predictions_tfidf, average='macro')
print("[NaiveBayes] accuracy: {}, f1-score: {}, precision: {}, recall: {}".format(accuracy_tfidf, f1, precision, recall))

[NaiveBayes] accuracy: 0.5667459688078245, f1-score: 0.5642933788779518, precision: 0.5639581298395031, recall: 0.579701261317569


In [12]:
# test['Prediction'] = p

In [13]:
test.head()

Unnamed: 0,ID,SENTENCE,LABEL
0,1,great books read very well,positive
1,2,boo hoo mar gaya bechara,negative


In [14]:
def trainSVM(X_train_tfidf, y_train_tfidf, X_test_tfidf, y_test_tfidf):
    clf = SVC(kernel='linear', gamma='auto')
    clf.fit(X_train_tfidf, y_train_tfidf)
    predictions_svc = clf.predict(X_test_tfidf)
    accuracy_svc = accuracy_score(y_test_tfidf, predictions_svc)
    # accuracy_svc
    precision, recall, f1, _ = precision_recall_fscore_support(y_test_tfidf, predictions_svc, average='macro')
    print("[Linear SVM] accuracy: {}, f1-score: {}, precision: {}, recall: {}".format(accuracy_svc, f1, precision, recall))

In [14]:
# 'linear' shows higher accuracy than 'rbf' (default)
clf = SVC(kernel='linear', gamma='auto')
clf.fit(X_train_tfidf, y_train_tfidf)
predictions_svc = clf.predict(X_test_tfidf)
accuracy_svc = accuracy_score(y_test_tfidf, predictions_svc)
# accuracy_svc
precision, recall, f1, _ = precision_recall_fscore_support(y_test_tfidf, predictions_svc, average='macro')
print("[Linear SVM] accuracy: {}, f1-score: {}, precision: {}, recall: {}".format(accuracy_svc, f1, precision, recall))

In [15]:
from sklearn.model_selection import LeaveOneOut
from sklearn import model_selection
X = train_char_features
y = train.LABEL
loo = LeaveOneOut()
for train_index, test_index in loo.split(X):
    X_train, X_test = X[train_index], X[test_index]
    y_train, y_test = y[train_index], y[test_index]
    trainSVM(X_train, y_train, X_test, y_test)