In [1]:
import pandas as pd
import numpy as np

In [65]:
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.preprocessing import LabelBinarizer
from sklearn.model_selection import train_test_split

from sklearn.svm import SVC

import nltk
from nltk.corpus import stopwords
from nltk import word_tokenize          
from nltk.stem import WordNetLemmatizer
from stop_words import get_stop_words
from nltk.corpus import stopwords
from nltk import pos_tag

# nltk.download('punkt')
# nltk.download('averaged_perceptron_tagger')

In [130]:
DATASET_SIZE = 10000
MAX_BOW_SIZE = 100

# Part1

In [131]:
data_set = pd.read_csv('dataset.csv')
data_set = data_set[:DATASET_SIZE] 
data_set.head(5)

Unnamed: 0,comment,sentiment
0,"Oh my god, it just doesn't get any worse than ...",negative
1,If you're a layman interested in quantum theor...,negative
2,It's amazing that this no talent actor Chapa g...,negative
3,This must be one of the most overrated Spanish...,negative
4,Some critics have compared Chop Shop with the ...,positive


In [132]:
data_set.describe()

Unnamed: 0,comment,sentiment
count,10000,10000
unique,9983,2
top,This movie is amazing. You will NEVER laugh ha...,negative
freq,2,5037


In [133]:
X = data_set['comment']
Y = data_set['sentiment']


# label binarization
label_binarizer = LabelBinarizer()
Y = label_binarizer.fit_transform(Y)
Y = np.ravel(Y)

# Part 2

In [134]:
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.2, random_state=5)

### Part2: Without pre-processing

In [135]:
count_vectorizer = CountVectorizer(lowercase = False, max_features=MAX_BOW_SIZE, token_pattern="[a-zA-Z0-9_'.]{1,}")

In [136]:
cv_X_train = count_vectorizer.fit_transform(X_train) #fit only over train data 
cv_X_test = count_vectorizer.transform(X_test) #apply not fit!

In [137]:
clf = SVC()
clf.fit(cv_X_train, Y_train)

SVC()

In [138]:
Y_test_pred = clf.predict(cv_X_test)
print(classification_report(Y_test, Y_test_pred))

              precision    recall  f1-score   support

           0       0.73      0.72      0.73      1010
           1       0.72      0.73      0.73       990

    accuracy                           0.73      2000
   macro avg       0.73      0.73      0.73      2000
weighted avg       0.73      0.73      0.73      2000



### Part2: Elementry pre-processing

In [149]:
count_vectorizer = CountVectorizer(lowercase=True, max_features=MAX_BOW_SIZE, token_pattern="[a-zA-Z_]{2,}")

In [150]:
cv_X_train = count_vectorizer.fit_transform(X_train) #fit only over train data 
cv_X_test = count_vectorizer.transform(X_test) #apply not fit!

In [151]:
clf = SVC()
clf.fit(cv_X_train, Y_train)

SVC()

In [152]:
Y_test_pred = clf.predict(cv_X_test)
print(classification_report(Y_test, Y_test_pred))

              precision    recall  f1-score   support

           0       0.75      0.70      0.72      1010
           1       0.71      0.76      0.74       990

    accuracy                           0.73      2000
   macro avg       0.73      0.73      0.73      2000
weighted avg       0.73      0.73      0.73      2000



### Part2: Adcanced pre-processing

In [143]:
stop_words = list(get_stop_words('en'))
nltk_words = list(stopwords.words('english'))
stop_words.extend(nltk_words)

def lemmatize(text):
    list_pos = 0
    cleaned_str = ''
    lmtzr = WordNetLemmatizer()
    
    tagged_words = pos_tag(text)
    for word in tagged_words:
        if 'v' in word[1].lower():
            lemma = lmtzr.lemmatize(word[0], pos='v')
        else:
            lemma = lmtzr.lemmatize(word[0], pos='n')
        if list_pos == 0:
            cleaned_str = lemma
        else:
            cleaned_str = cleaned_str + ' ' + lemma
        list_pos += 1
    return cleaned_str

def clean(text):
    text = str(text).lower() #lowercase
    text = re.sub(r'\bid\b', 'i would', text) #start abbreviation
    text = re.sub(r'\bive\b', 'i have', text)
    text = re.sub(r'\bim\b', 'i am', text)
    text = re.sub(r'\bcant\b', 'can not', text)
    text = re.sub(r'\bdont\b', 'do not', text)
    text = re.sub(r'\bwont\b', 'will not', text)
    text = re.sub(r'\bthats\b', 'that is', text) #end abbreviation
    text = re.sub('[0-9]+', '', text) # delete numbers
    text = re.sub(r'[^\x00-\x7F]+',' ', text) #remove non-ascii
    text = re.sub('[<>{}=~.,،:\\!?\\-()\\[\\]#/@"]+|[_x000D_]+|\u200c+|[\r\n]', ' ', text) #remove punctuations
    word_list = nltk.word_tokenize(text)
    text = lemmatize(word_list)
    word_list = text.split()
    word_list = list(filter(lambda word: word not in stop_words, word_list)) # delete stopwords
    word_list = [w for w in word_list if len(w)>1] # delete len = 1
    return word_list


In [144]:
clean('i have done someging baD')

['someging', 'bad']

In [145]:
count_vectorizer = CountVectorizer(tokenizer=lambda text: clean(text), max_features=MAX_BOW_SIZE)

In [146]:
cv_X_train = count_vectorizer.fit_transform(X_train) #fit only over train data 
cv_X_test = count_vectorizer.transform(X_test) #apply not fit!

In [147]:
clf = SVC()
clf.fit(cv_X_train, Y_train)

SVC()

In [148]:
Y_test_pred = clf.predict(cv_X_test)
print(classification_report(Y_test, Y_test_pred))

              precision    recall  f1-score   support

           0       0.75      0.71      0.73      1010
           1       0.72      0.76      0.74       990

    accuracy                           0.73      2000
   macro avg       0.73      0.73      0.73      2000
weighted avg       0.73      0.73      0.73      2000

