In [1]:
import pandas as pd
import numpy as np
import re

In [2]:
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix, accuracy_score
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.preprocessing import LabelBinarizer
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import LogisticRegression


from sklearn.svm import SVC

import nltk
from nltk.corpus import stopwords
from nltk import word_tokenize          
from nltk.stem import WordNetLemmatizer
from stop_words import get_stop_words
from nltk.corpus import stopwords
from nltk import pos_tag

# ! pip install stopwords
# ! pip install stop_words

# nltk.download('punkt')
# nltk.download('averaged_perceptron_tagger')
# nltk.download('stopwords')
# nltk.download('wordnet')

In [3]:
import warnings
# Suppress every warning for the rest of the session (no category or module filter is given).
# NOTE(review): this presumably also hides scikit-learn/NLTK warnings emitted by the
# vectorizers and model fits below - confirm that hiding them is intended.
warnings.filterwarnings('ignore')

In [4]:
# Vocabulary cap: passed as max_features to the CountVectorizer instances below.
MAX_BOW_SIZE = 1000
# Number of leading rows of dataset.csv that are kept for the experiments.
DATASET_SIZE = 10000

In [5]:
def analysis(labels, predictions, target_names=('negative', 'positive')):
    """Print a classification report, confusion matrix and accuracy score.

    Parameters
    ----------
    labels : array-like
        Ground-truth class labels.
    predictions : array-like
        Predicted class labels, aligned with ``labels``.
    target_names : sequence of str, optional
        Display names for the classes, given in ascending label order.
        The default matches this notebook's encoding, where the printed
        ``Y[:5]`` shows 'negative' encoded as 0 and 'positive' as 1.

    Returns
    -------
    None
        Everything is written to standard output.
    """
    # classification_report pairs target_names with the sorted labels, so the
    # name for label 0 ('negative') must come first; the reverse order would
    # attach each row's metrics to the wrong class name.
    print("Report Classification: \n",
          classification_report(labels, predictions, target_names=list(target_names)))
    print("Matrix Confusion: \n", confusion_matrix(labels, predictions))
    print("Accuracy: \n", accuracy_score(labels, predictions))
    


# Part1

In [6]:
# Read the reviews and keep only the first DATASET_SIZE rows.
data_set = pd.read_csv('dataset.csv').iloc[:DATASET_SIZE]
# Preview the first five rows.
data_set.head(5)

Unnamed: 0,comment,sentiment
0,"Oh my god, it just doesn't get any worse than ...",negative
1,If you're a layman interested in quantum theor...,negative
2,It's amazing that this no talent actor Chapa g...,negative
3,This must be one of the most overrated Spanish...,negative
4,Some critics have compared Chop Shop with the ...,positive


In [7]:
# Summary statistics per column. In the recorded output `comment` has 9983 unique
# values out of 10000 rows, i.e. a few reviews appear more than once.
data_set.describe()

Unnamed: 0,comment,sentiment
count,10000,10000
unique,9983,2
top,"Elfriede Jelinek, not quite a household name y...",negative
freq,2,5037


In [8]:
# Model inputs are the raw review texts.
X = data_set['comment']

# Encode the sentiment strings as integers and flatten the result to 1-D.
label_binarizer = LabelBinarizer()
Y = label_binarizer.fit_transform(data_set['sentiment']).ravel()
print(Y[:5])

[0 0 0 0 1]


# Part 2,3

In [9]:
# Hold out 20% of the rows as the test set; random_state is fixed so the split is repeatable.
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.2, random_state=5)

### Part2: Without pre-processing
- lowercase is False
- Pattern: any run of letters, digits, `_`, `'` or `.` (length ≥ 1)

In [10]:
# Baseline bag-of-words: case is preserved and a token is any run of letters,
# digits, '_', "'" or '.'; the vocabulary is capped at MAX_BOW_SIZE entries.
count_vectorizer = CountVectorizer(lowercase = False, max_features=MAX_BOW_SIZE, token_pattern="[a-zA-Z0-9_'.]{1,}")

In [11]:
cv_X_train = count_vectorizer.fit_transform(X_train) # learn the vocabulary from the training texts only
cv_X_test = count_vectorizer.transform(X_test) # reuse that vocabulary; the test texts are transformed, never fitted

### Part 3.1: Without pre-processing

In [None]:
#### svm, knn and logistic regression on the un-preprocessed features

# One entry per model: (banner line, estimator, hyper-parameter grid,
# whether best_estimator_ is printed in addition to best_params_).
model_specs = [
    ("-------SVM-------:", SVC(),
     [{'kernel': ['rbf'], 'C': [1, 10]}], False),
    ("-------KNN-------:", KNeighborsClassifier(),
     {'n_neighbors': list(range(1, 31)), 'weights': ["uniform", "distance"]}, True),
    ("-------LR-------:", LogisticRegression(),
     {'penalty': ['l2'], 'C': [1, 10]}, True),
]

for banner, estimator, grid, show_estimator in model_specs:
    # Cross-validated search on the training split, selecting by F1.
    clf = GridSearchCV(estimator, grid, scoring='f1')
    clf.fit(cv_X_train, Y_train)

    print(banner)
    print("Best parameters set found on development set:")
    print(clf.best_params_)
    if show_estimator:
        print(clf.best_estimator_)

    # Score the selected model on the held-out test split.
    Y_test_pred = clf.predict(cv_X_test)
    analysis(Y_test, Y_test_pred)

### Part2: Elementary pre-processing
- lowercase is True
- Pattern: just words with length > 1

In [None]:
# Elementary pre-processing: text is lower-cased and a token is any run of two or
# more letters/underscores; the vocabulary is capped at MAX_BOW_SIZE entries.
count_vectorizer = CountVectorizer(lowercase=True, max_features=MAX_BOW_SIZE, token_pattern="[a-zA-Z_]{2,}")

In [None]:
cv_X_train = count_vectorizer.fit_transform(X_train) # learn the vocabulary from the training texts only
cv_X_test = count_vectorizer.transform(X_test) # reuse that vocabulary; the test texts are transformed, never fitted

### Part 3.1: Elementary pre-processing

In [None]:
#### svm, knn and logistic regression on the elementary-preprocessed features

# One entry per model: (banner line, estimator, hyper-parameter grid,
# whether best_estimator_ is printed in addition to best_params_).
model_specs = [
    ("-------SVM-------:", SVC(),
     [{'kernel': ['rbf'], 'C': [1, 10]}], False),
    ("-------KNN-------:", KNeighborsClassifier(),
     {'n_neighbors': list(range(1, 10)), 'weights': ["uniform", "distance"]}, True),
    ("-------LR-------:", LogisticRegression(),
     {'penalty': ['l2'], 'C': [1, 10]}, True),
]

for banner, estimator, grid, show_estimator in model_specs:
    # Cross-validated search on the training split, selecting by F1.
    clf = GridSearchCV(estimator, grid, scoring='f1')
    clf.fit(cv_X_train, Y_train)

    print(banner)
    print("Best parameters set found on development set:")
    print(clf.best_params_)
    if show_estimator:
        print(clf.best_estimator_)

    # Score the selected model on the held-out test split.
    Y_test_pred = clf.predict(cv_X_test)
    analysis(Y_test, Y_test_pred)

### Part2: Advanced pre-processing
- lowercase is True
- Pattern: just words with length > 1
- lemmatization
- stopword removal
- abbreviation expansion

In [None]:
# Combined stop-word list: the `stop_words` package's English list followed by
# NLTK's English list (duplicates between the two are kept).
nltk_words = list(stopwords.words('english'))
stop_words = [*get_stop_words('en'), *nltk_words]

def lemmatize(text):
    """Lemmatize a sequence of tokens and return them joined by single spaces.

    ``text`` is an iterable of word tokens (not a raw string). Each token is
    POS-tagged; it is lemmatized as a verb when its tag contains the letter
    'v' (case-insensitively) and as a noun otherwise. An empty input yields
    an empty string.
    """
    wordnet = WordNetLemmatizer()
    lemmas = [
        wordnet.lemmatize(token, pos='v' if 'v' in tag.lower() else 'n')
        for token, tag in pos_tag(text)
    ]
    return ' '.join(lemmas)

def clean(text):
    """Normalize a raw review and return its list of content-word tokens.

    Steps, in order: lower-case; drop the literal ``_x000d_`` artifact;
    expand a fixed set of apostrophe-less contractions; delete digits;
    replace non-ASCII runs and punctuation (including underscores and line
    breaks) with spaces; tokenize with ``nltk.word_tokenize``; lemmatize via
    ``lemmatize``; finally drop stop words and one-character tokens.

    Parameters
    ----------
    text : object
        The raw document; it is converted with ``str()`` first.

    Returns
    -------
    list of str
        The surviving tokens, in document order.
    """
    text = str(text).lower()

    # Remove the "_x000D_" artifact as a literal (lower-cased by now). This has
    # to happen before digits are stripped, otherwise it degrades to "_xd_".
    # A character class such as [_x000D_] must not be used here: it matches the
    # single characters '_', 'x', '0' and 'D' and would delete every 'x'.
    text = text.replace('_x000d_', ' ')

    # Expand apostrophe-less contractions.
    contractions = (
        (r'\bid\b', 'i would'),
        (r'\bive\b', 'i have'),
        (r'\bim\b', 'i am'),
        (r'\bcant\b', 'can not'),
        (r'\bdont\b', 'do not'),
        (r'\bwont\b', 'will not'),
        (r'\bthats\b', 'that is'),
    )
    for pattern, expansion in contractions:
        text = re.sub(pattern, expansion, text)

    text = re.sub(r'[0-9]+', '', text)  # delete numbers
    text = re.sub(r'[^\x00-\x7F]+', ' ', text)  # remove non-ascii
    # Remove punctuation, underscores, zero-width non-joiners and line breaks.
    text = re.sub(r'[<>{}=~.,،:!?\-()\[\]#/@"_]+|\u200c+|[\r\n]', ' ', text)

    tokens = lemmatize(nltk.word_tokenize(text)).split()

    # Set membership keeps the stop-word test O(1) per token.
    stop_set = set(stop_words)
    return [tok for tok in tokens if tok not in stop_set and len(tok) > 1]


In [None]:
# Advanced bag-of-words: `clean` performs all normalisation and tokenisation.
# The function is passed directly instead of being wrapped in a lambda; the
# tokens produced are the same, and the vectorizer stays picklable by reference.
count_vectorizer = CountVectorizer(tokenizer=clean, max_features=MAX_BOW_SIZE)

In [None]:
cv_X_train = count_vectorizer.fit_transform(X_train) # learn the vocabulary from the training texts only
cv_X_test = count_vectorizer.transform(X_test) # reuse that vocabulary; the test texts are transformed, never fitted

### Part 3.1: Advanced pre-processing

In [None]:
#### svm, knn and logistic regression on the advanced-preprocessed features

# One entry per model: (banner line, estimator, hyper-parameter grid,
# whether best_estimator_ is printed in addition to best_params_).
model_specs = [
    ("-------SVM-------:", SVC(),
     [{'kernel': ['rbf'], 'C': [1, 10]}], False),
    ("-------KNN-------:", KNeighborsClassifier(),
     {'n_neighbors': list(range(1, 10)), 'weights': ["uniform", "distance"]}, True),
    ("-------LR-------:", LogisticRegression(),
     {'penalty': ['l2'], 'C': [1, 10]}, True),
]

for banner, estimator, grid, show_estimator in model_specs:
    # Cross-validated search on the training split, selecting by F1.
    clf = GridSearchCV(estimator, grid, scoring='f1')
    clf.fit(cv_X_train, Y_train)

    print(banner)
    print("Best parameters set found on development set:")
    print(clf.best_params_)
    if show_estimator:
        print(clf.best_estimator_)

    # Score the selected model on the held-out test split.
    Y_test_pred = clf.predict(cv_X_test)
    analysis(Y_test, Y_test_pred)