In [1]:
import re

import numpy as np
import pandas as pd

In [2]:
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.preprocessing import LabelBinarizer
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import LogisticRegression


from sklearn.svm import SVC

import nltk
from nltk.corpus import stopwords
from nltk import word_tokenize          
from nltk.stem import WordNetLemmatizer
from stop_words import get_stop_words
from nltk.corpus import stopwords
from nltk import pos_tag

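# One-time NLTK resource downloads used by this notebook (uncomment on first run):
# nltk.download('punkt')
# nltk.download('averaged_perceptron_tagger')
# nltk.download('wordnet')
# nltk.download('stopwords')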

In [4]:
# Number of leading rows of the CSV that are kept for the experiment.
DATASET_SIZE = 10000
# Cap on the bag-of-words vocabulary (passed to CountVectorizer as max_features).
MAX_BOW_SIZE = 100

# Part 1

In [5]:
# Read the CSV and keep only its first DATASET_SIZE rows, then preview the frame.
data_set = pd.read_csv('dataset.csv').iloc[:DATASET_SIZE]
data_set.head()

In [7]:
# Raw comment text is the model input.
X = data_set['comment']

# Binarize the sentiment labels and flatten the resulting column into a 1-D array.
# NOTE(review): flattening presumably relies on 'sentiment' having exactly two
# classes (LabelBinarizer returns one column per class otherwise) - confirm.
label_binarizer = LabelBinarizer()
Y = label_binarizer.fit_transform(data_set['sentiment']).ravel()

# Part 2

In [8]:
# Hold out 20% of the rows as the test set; the fixed random_state keeps the split reproducible.
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.2, random_state=5)

In [None]:
# Combined English stop-word list: the `stop_words` package entries followed by NLTK's.
nltk_words = list(stopwords.words('english'))
stop_words = list(get_stop_words('en')) + nltk_words

def lemmatize(text):
    """Lemmatize a sequence of tokens and return them as one space-joined string.

    Every token is POS-tagged with ``pos_tag``. A token whose tag contains the
    letter ``v`` (case-insensitive) is lemmatized as a verb; any other token is
    lemmatized as a noun.

    Args:
        text: iterable of word tokens (as accepted by ``pos_tag``).

    Returns:
        The lemmas joined by single spaces; an empty string when there are no
        tokens.
    """
    lemmatizer = WordNetLemmatizer()
    lemmas = [
        lemmatizer.lemmatize(token, pos='v' if 'v' in tag.lower() else 'n')
        for token, tag in pos_tag(text)
    ]
    return ' '.join(lemmas)

def clean(text):
    """Normalize one raw comment and return its list of content tokens.

    Steps, in order: lower-case, drop the literal ``_x000d_`` marker, expand a
    few apostrophe-less contractions, delete digits, replace non-ASCII runs and
    punctuation with spaces, tokenize, lemmatize, then discard stop words and
    one-character tokens.

    Args:
        text: the comment; any object is accepted and converted with ``str``.

    Returns:
        list[str]: the cleaned, lemmatized tokens.
    """
    text = str(text).lower()

    # Remove the literal "_x000D_" marker (presumably a carriage-return artefact
    # of a spreadsheet export - confirm). This must run before digits are
    # deleted, otherwise the marker is mangled to "_xd_" and can no longer be
    # recognized. The previous pattern used the character class "[_x000D_]",
    # which silently deleted every "x" in the text instead.
    text = text.replace('_x000d_', ' ')

    # Expand common contractions that were typed without an apostrophe.
    contractions = (
        (r'\bid\b', 'i would'),
        (r'\bive\b', 'i have'),
        (r'\bim\b', 'i am'),
        (r'\bcant\b', 'can not'),
        (r'\bdont\b', 'do not'),
        (r'\bwont\b', 'will not'),
        (r'\bthats\b', 'that is'),
    )
    for pattern, expansion in contractions:
        text = re.sub(pattern, expansion, text)

    text = re.sub('[0-9]+', '', text)  # delete numbers
    text = re.sub(r'[^\x00-\x7F]+', ' ', text)  # replace non-ASCII runs
    # Replace punctuation, underscores and line breaks with a space. Non-ASCII
    # punctuation is already gone after the previous step.
    text = re.sub(r'[<>{}=~.,:!?\-()\[\]#/@"_\r\n]+', ' ', text)

    tokens = nltk.word_tokenize(text)
    # lemmatize() returns a space-joined string, so split it back into tokens.
    tokens = lemmatize(tokens).split()
    # Drop stop words and single-character leftovers.
    return [tok for tok in tokens if tok not in stop_words and len(tok) > 1]


In [None]:
# Bag-of-words vectorizer that tokenizes with clean() and keeps at most
# MAX_BOW_SIZE features. clean is passed directly rather than through a lambda
# so the vectorizer stays picklable; token_pattern=None tells scikit-learn the
# default pattern is intentionally unused because a custom tokenizer is given.
count_vectorizer = CountVectorizer(tokenizer=clean, token_pattern=None, max_features=MAX_BOW_SIZE)

In [None]:
# Learn the vocabulary from the training comments only, so that nothing from the test split influences the features.
cv_X_train = count_vectorizer.fit_transform(X_train)
# Encode the test comments with the already-fitted vocabulary (transform only, no refit).
cv_X_test = count_vectorizer.transform(X_test)

# Part 3.2

In [None]:
def _grid_search_report(estimator, param_grid, show_estimator=True, cv=4):
    """Tune one estimator with GridSearchCV and print its test-set report.

    Fits the search on ``cv_X_train`` / ``Y_train`` with accuracy scoring,
    prints the best parameters (and optionally the best estimator), then
    prints a classification report for the predictions on ``cv_X_test``.

    Args:
        estimator: unfitted scikit-learn classifier.
        param_grid: parameter grid (dict or list of dicts) for GridSearchCV.
        show_estimator: also print ``best_estimator_`` when True.
        cv: number of cross-validation folds.

    Returns:
        tuple: ``(fitted GridSearchCV, predictions for cv_X_test)``.
    """
    search = GridSearchCV(estimator, param_grid, cv=cv, scoring='accuracy')
    search.fit(cv_X_train, Y_train)

    print("Best parameters set found on development set:")
    print(search.best_params_)
    if show_estimator:
        print(search.best_estimator_)
    predictions = search.predict(cv_X_test)
    print(classification_report(Y_test, predictions))
    return search, predictions


#### svm
tuned_parameters = [{'kernel': ['rbf'], 'gamma': [1e-3, 1e-4],
                     'C': [1, 10, 100, 1000]},
                    {'kernel': ['linear'], 'C': [1, 10, 100, 1000]}]

clf, Y_test_pred = _grid_search_report(SVC(), tuned_parameters, show_estimator=False)

#### knn
k_range = list(range(1, 10))
weight_options = ["uniform", "distance"]

param_grid = dict(n_neighbors=k_range, weights=weight_options)
knn = KNeighborsClassifier()

clf, Y_test_pred = _grid_search_report(knn, param_grid)

#### logistic regression
grid_values = {'penalty': ['l2'], 'C': [0.001, 0.01, 0.1, 1, 10, 100, 1000]}

clf, Y_test_pred = _grid_search_report(LogisticRegression(), grid_values)