In [20]:
import pandas as pd
import numpy as np
import re

In [25]:
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.preprocessing import LabelBinarizer
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import LogisticRegression


from sklearn.svm import SVC

import nltk
from nltk.corpus import stopwords
from nltk import word_tokenize          
from nltk.stem import WordNetLemmatizer
from stop_words import get_stop_words
from nltk.corpus import stopwords
from nltk import pos_tag

# ! pip install stopwords
# ! pip install stop_words

# nltk.download('punkt')
# nltk.download('averaged_perceptron_tagger')
# nltk.download('stopwords')
# nltk.download('wordnet')

In [3]:
# Number of leading rows of dataset.csv kept for the experiments.
DATASET_SIZE = 100
# Vocabulary cap handed to CountVectorizer(max_features=...).
MAX_BOW_SIZE = 10

# Part1

In [4]:
# Read the comments file and keep only its first DATASET_SIZE rows.
data_set = pd.read_csv('dataset.csv')[:DATASET_SIZE]

# Preview the first five rows.
data_set.head()

Unnamed: 0,comment,sentiment
0,"Oh my god, it just doesn't get any worse than ...",negative
1,If you're a layman interested in quantum theor...,negative
2,It's amazing that this no talent actor Chapa g...,negative
3,This must be one of the most overrated Spanish...,negative
4,Some critics have compared Chop Shop with the ...,positive


In [5]:
# Summary statistics of the loaded columns (count / unique / top / freq).
data_set.describe()

Unnamed: 0,comment,sentiment
count,100,100
unique,100,2
top,One of the best silent dramas I've seen. As da...,positive
freq,1,50


In [6]:
# Features: the comment column, as a pandas Series.
X = data_set['comment']

# Targets: the sentiment column encoded by a LabelBinarizer and flattened
# to a 1-D array. The fitted binarizer stays available as `label_binarizer`.
label_binarizer = LabelBinarizer()
Y = np.ravel(label_binarizer.fit_transform(data_set['sentiment']))

# Part 2,3

In [7]:
# Hold out 20% of the rows for testing; the fixed random_state keeps the
# split identical between runs.
X_train, X_test, Y_train, Y_test = train_test_split(
    X,
    Y,
    test_size=0.2,
    random_state=5,
)

### Part2: Without pre-processing
- lowercase is False
- Pattern: everything

In [8]:
# Bag-of-words on the untouched text: case is preserved and a token is any
# run of letters, digits, underscores, apostrophes or dots.
count_vectorizer = CountVectorizer(
    lowercase=False,
    max_features=MAX_BOW_SIZE,
    token_pattern="[a-zA-Z0-9_'.]{1,}",
)

In [9]:
# Build the vocabulary from the training split only, then encode the test
# split with that same fitted vectorizer.
cv_X_train = count_vectorizer.fit_transform(X_train)  # fit on the training data only
cv_X_test = count_vectorizer.transform(X_test)  # transform only -- the test data is never used for fitting

### Part 3.1: Without pre-processing

In [10]:
# Model selection on the current bag-of-words features (cv_X_train / cv_X_test).
# Every classifier is tuned with a 4-fold, accuracy-scored grid search on the
# training split and then reported on the held-out test split.

# SVM grid: RBF kernel over gamma x C, plus a linear kernel over C.
tuned_parameters = [
    {'kernel': ['rbf'], 'gamma': [1e-3, 1e-4], 'C': [1, 10, 100, 1000]},
    {'kernel': ['linear'], 'C': [1, 10, 100, 1000]},
]

# KNN grid: k = 1..9 with both neighbour-weighting schemes.
k_range = list(range(1, 10))
weight_options = ["uniform", "distance"]
param_grid = {'n_neighbors': k_range, 'weights': weight_options}
knn = KNeighborsClassifier()

# Logistic-regression grid: L2 penalty over a range of C values.
grid_values = {'penalty': ['l2'], 'C': [0.001, 0.01, 0.1, 1, 10, 100, 1000]}

# (banner, estimator, grid, echo best_estimator_?) -- the SVM section prints
# only its best parameters, the other two also print the best estimator.
experiments = [
    ("-------SVM-------:", SVC(), tuned_parameters, False),
    ("-------KNN-------:", knn, param_grid, True),
    ("-------LR-------:", LogisticRegression(), grid_values, True),
]

for banner, estimator, grid, show_estimator in experiments:
    clf = GridSearchCV(estimator, grid, cv=4, scoring='accuracy')
    clf.fit(cv_X_train, Y_train)

    print(banner)
    print("Best parameters set found on development set:")
    print(clf.best_params_)
    if show_estimator:
        print(clf.best_estimator_)

    Y_test_pred = clf.predict(cv_X_test)
    print(classification_report(Y_test, Y_test_pred))

-------SVM-------:
Best parameters set found on development set:
{'C': 100, 'gamma': 0.0001, 'kernel': 'rbf'}
              precision    recall  f1-score   support

           0       0.40      0.33      0.36        12
           1       0.20      0.25      0.22         8

    accuracy                           0.30        20
   macro avg       0.30      0.29      0.29        20
weighted avg       0.32      0.30      0.31        20

-------KNN-------:
Best parameters set found on development set:
{'n_neighbors': 9, 'weights': 'distance'}
KNeighborsClassifier(n_neighbors=9, weights='distance')
              precision    recall  f1-score   support

           0       0.62      0.42      0.50        12
           1       0.42      0.62      0.50         8

    accuracy                           0.50        20
   macro avg       0.52      0.52      0.50        20
weighted avg       0.54      0.50      0.50        20

-------LR-------:
Best parameters set found on development set:
{'C': 0.0

### Part2: Elementary pre-processing
- lowercase is True
- Pattern: just words with length>1

In [11]:
# Bag-of-words with elementary cleaning: text is lowercased and a token is a
# run of at least two letters/underscores.
count_vectorizer = CountVectorizer(
    lowercase=True,
    max_features=MAX_BOW_SIZE,
    token_pattern="[a-zA-Z_]{2,}",
)

In [12]:
# Build the vocabulary from the training split only, then encode the test
# split with that same fitted vectorizer.
cv_X_train = count_vectorizer.fit_transform(X_train)  # fit on the training data only
cv_X_test = count_vectorizer.transform(X_test)  # transform only -- the test data is never used for fitting

### Part 3.1: Elementary pre-processing

In [13]:
# Model selection on the current bag-of-words features (cv_X_train / cv_X_test).
# Every classifier is tuned with a 4-fold, accuracy-scored grid search on the
# training split and then reported on the held-out test split.

# SVM grid: RBF kernel over gamma x C, plus a linear kernel over C.
tuned_parameters = [
    {'kernel': ['rbf'], 'gamma': [1e-3, 1e-4], 'C': [1, 10, 100, 1000]},
    {'kernel': ['linear'], 'C': [1, 10, 100, 1000]},
]

# KNN grid: k = 1..9 with both neighbour-weighting schemes.
k_range = list(range(1, 10))
weight_options = ["uniform", "distance"]
param_grid = {'n_neighbors': k_range, 'weights': weight_options}
knn = KNeighborsClassifier()

# Logistic-regression grid: L2 penalty over a range of C values.
grid_values = {'penalty': ['l2'], 'C': [0.001, 0.01, 0.1, 1, 10, 100, 1000]}

# (banner, estimator, grid, echo best_estimator_?) -- the SVM section prints
# only its best parameters, the other two also print the best estimator.
experiments = [
    ("-------SVM-------:", SVC(), tuned_parameters, False),
    ("-------KNN-------:", knn, param_grid, True),
    ("-------LR-------:", LogisticRegression(), grid_values, True),
]

for banner, estimator, grid, show_estimator in experiments:
    clf = GridSearchCV(estimator, grid, cv=4, scoring='accuracy')
    clf.fit(cv_X_train, Y_train)

    print(banner)
    print("Best parameters set found on development set:")
    print(clf.best_params_)
    if show_estimator:
        print(clf.best_estimator_)

    Y_test_pred = clf.predict(cv_X_test)
    print(classification_report(Y_test, Y_test_pred))

-------SVM-------:
Best parameters set found on development set:
{'C': 1, 'gamma': 0.001, 'kernel': 'rbf'}
              precision    recall  f1-score   support

           0       0.50      0.50      0.50        12
           1       0.25      0.25      0.25         8

    accuracy                           0.40        20
   macro avg       0.38      0.38      0.38        20
weighted avg       0.40      0.40      0.40        20

-------KNN-------:
Best parameters set found on development set:
{'n_neighbors': 8, 'weights': 'uniform'}
KNeighborsClassifier(n_neighbors=8)
              precision    recall  f1-score   support

           0       0.50      0.33      0.40        12
           1       0.33      0.50      0.40         8

    accuracy                           0.40        20
   macro avg       0.42      0.42      0.40        20
weighted avg       0.43      0.40      0.40        20

-------LR-------:
Best parameters set found on development set:
{'C': 0.001, 'penalty': 'l2'}
Log

### Part2: Advanced pre-processing
- lowercase is True
- Pattern: just words with length>1
- lemmatization
- stop-word removal
- abbreviation expansion

In [21]:
# Combined English stop-word list: the `stop_words` package entries followed
# by NLTK's. No de-duplication is performed between the two sources.
nltk_words = list(stopwords.words('english'))
stop_words = list(get_stop_words('en')) + nltk_words

def lemmatize(text):
    """Lemmatize a sequence of tokens and return them as one string.

    Each token is POS-tagged with ``pos_tag``. A token whose tag contains the
    letter "v" (case-insensitive) is lemmatized as a verb; every other token
    is lemmatized as a noun.

    Args:
        text: Sequence of word tokens (``clean`` passes the output of
            ``nltk.word_tokenize``).

    Returns:
        str: The lemmas joined by single spaces, in input order; an empty
        string when ``text`` is empty.
    """
    lmtzr = WordNetLemmatizer()
    lemmas = [
        lmtzr.lemmatize(token, pos='v' if 'v' in tag.lower() else 'n')
        for token, tag in pos_tag(text)
    ]
    return ' '.join(lemmas)

def clean(text):
    """Normalise one raw comment and return its list of content tokens.

    Steps, in order: lowercase; strip literal ``_x000d_`` markers; expand a
    few apostrophe-less contractions; delete digits; replace non-ASCII
    characters and punctuation (including underscores) with spaces; tokenize
    with ``nltk.word_tokenize``; lemmatize; drop stop words and
    one-character tokens.

    Args:
        text: The comment to clean; it is coerced with ``str()``.

    Returns:
        list[str]: The surviving lemmatized tokens, in their original order.
    """
    text = str(text).lower() #lowercase

    # Strip the literal "_x000D_" marker (presumably a spreadsheet-export
    # escape for a carriage return -- TODO confirm against the data source).
    # It must happen here: once digits are deleted below, the marker can no
    # longer be recognised. The previous pattern `[_x000D_]+` was a character
    # class, so it never matched the marker as a unit and instead blanked out
    # every "x" in the text (e.g. "excellent" -> "e cellent").
    text = re.sub(r'_x000d_', ' ', text)

    # Expand apostrophe-less contractions.
    text = re.sub(r'\bid\b', 'i would', text) #start abbreviation
    text = re.sub(r'\bive\b', 'i have', text)
    text = re.sub(r'\bim\b', 'i am', text)
    text = re.sub(r'\bcant\b', 'can not', text)
    text = re.sub(r'\bdont\b', 'do not', text)
    text = re.sub(r'\bwont\b', 'will not', text)
    text = re.sub(r'\bthats\b', 'that is', text) #end abbreviation

    text = re.sub('[0-9]+', '', text) # delete numbers
    text = re.sub(r'[^\x00-\x7F]+',' ', text) #remove non-ascii
    # Punctuation -> space. The underscore stays in the class so stray
    # underscores are still treated as separators, as before.
    text = re.sub('[<>{}=~.,،:\\!?\\-()\\[\\]#/@"_]+|\u200c+|[\r\n]', ' ', text) #remove punctuations

    word_list = nltk.word_tokenize(text)
    text = lemmatize(word_list)

    # Stop words are filtered after lemmatization, so the lemma is what is
    # looked up; single-character leftovers are dropped as well.
    return [
        word for word in text.split()
        if word not in stop_words and len(word) > 1
    ]


In [22]:
# Bag-of-words over the tokens returned by clean().
# - clean is passed directly rather than through a lambda wrapper: same
#   tokenization, and the vectorizer stays picklable.
# - token_pattern=None: the regex pattern is not used once a tokenizer is
#   supplied, and leaving the default in place makes scikit-learn emit an
#   "unused token_pattern" warning on fit.
count_vectorizer = CountVectorizer(
    tokenizer=clean,
    token_pattern=None,
    max_features=MAX_BOW_SIZE,
)

In [26]:
# Build the vocabulary from the training split only, then encode the test
# split with that same fitted vectorizer.
cv_X_train = count_vectorizer.fit_transform(X_train)  # fit on the training data only
cv_X_test = count_vectorizer.transform(X_test)  # transform only -- the test data is never used for fitting

### Part 3.1: Advanced pre-processing

In [27]:
# Model selection on the current bag-of-words features (cv_X_train / cv_X_test).
# Every classifier is tuned with a 4-fold, accuracy-scored grid search on the
# training split and then reported on the held-out test split.

# SVM grid: RBF kernel over gamma x C, plus a linear kernel over C.
tuned_parameters = [
    {'kernel': ['rbf'], 'gamma': [1e-3, 1e-4], 'C': [1, 10, 100, 1000]},
    {'kernel': ['linear'], 'C': [1, 10, 100, 1000]},
]

# KNN grid: k = 1..9 with both neighbour-weighting schemes.
k_range = list(range(1, 10))
weight_options = ["uniform", "distance"]
param_grid = {'n_neighbors': k_range, 'weights': weight_options}
knn = KNeighborsClassifier()

# Logistic-regression grid: L2 penalty over a range of C values.
grid_values = {'penalty': ['l2'], 'C': [0.001, 0.01, 0.1, 1, 10, 100, 1000]}

# (banner, estimator, grid, echo best_estimator_?) -- the SVM section prints
# only its best parameters, the other two also print the best estimator.
experiments = [
    ("-------SVM-------:", SVC(), tuned_parameters, False),
    ("-------KNN-------:", knn, param_grid, True),
    ("-------LR-------:", LogisticRegression(), grid_values, True),
]

for banner, estimator, grid, show_estimator in experiments:
    clf = GridSearchCV(estimator, grid, cv=4, scoring='accuracy')
    clf.fit(cv_X_train, Y_train)

    print(banner)
    print("Best parameters set found on development set:")
    print(clf.best_params_)
    if show_estimator:
        print(clf.best_estimator_)

    Y_test_pred = clf.predict(cv_X_test)
    print(classification_report(Y_test, Y_test_pred))

-------SVM-------:
Best parameters set found on development set:
{'C': 10, 'gamma': 0.0001, 'kernel': 'rbf'}
              precision    recall  f1-score   support

           0       1.00      0.08      0.15        12
           1       0.42      1.00      0.59         8

    accuracy                           0.45        20
   macro avg       0.71      0.54      0.37        20
weighted avg       0.77      0.45      0.33        20

-------KNN-------:
Best parameters set found on development set:
{'n_neighbors': 2, 'weights': 'uniform'}
KNeighborsClassifier(n_neighbors=2)
              precision    recall  f1-score   support

           0       0.54      0.58      0.56        12
           1       0.29      0.25      0.27         8

    accuracy                           0.45        20
   macro avg       0.41      0.42      0.41        20
weighted avg       0.44      0.45      0.44        20

-------LR-------:
Best parameters set found on development set:
{'C': 0.001, 'penalty': 'l2'}
L