In [79]:
import pandas as pd
import numpy as np
import re
import string
import nltk
import unidecode
from nltk.corpus import stopwords
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer,TfidfVectorizer
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import GridSearchCV, train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.metrics import roc_auc_score,confusion_matrix,plot_confusion_matrix
from sklearn.metrics import f1_score
from sklearn.ensemble import RandomForestClassifier,GradientBoostingClassifier,AdaBoostClassifier
import xgboost
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score, precision_score, recall_score
from sklearn.model_selection import cross_val_score
from sklearn.compose import ColumnTransformer
import seaborn as sns
from nltk.sentiment.vader import SentimentIntensityAnalyzer
from sklearn import preprocessing
from bs4 import BeautifulSoup
import warnings
warnings.filterwarnings('ignore')
from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = "all"

In [80]:
df = pd.read_csv("datasets/sentiment_train.csv")
pd.set_option('display.max_colwidth',None)
df.head()

Unnamed: 0,Sentence,Polarity
0,Wow... Loved this place.,1
1,Crust is not good.,0
2,Not tasty and the texture was just nasty.,0
3,Stopped by during the late May bank holiday off Rick Steve recommendation and loved it.,1
4,The selection on the menu was great and so were the prices.,1


In [4]:
df.isna().sum()

Sentence    0
Polarity    0
dtype: int64

In [48]:
#checking for blank spaces
blanks=[]
for index,Sentence,Polarity in df.itertuples():
    if type(Sentence)==str:
        if Sentence.isspace():
            blanks.append(index)
print(blanks)

[]


In [49]:
#checking distribution in target feature.No imbalance
df['Polarity'].value_counts()/len(df)

0    0.505417
1    0.494583
Name: Polarity, dtype: float64

In [9]:
for rows in df["Sentence"]:
    if re.findall(r'\#NAME',rows):
        print(rows)

#NAME?
#NAME?
#NAME?
#NAME?


In [81]:
#removing those useless columns that Excel marked as #NAME?
df.drop(df[df["Sentence"]=="#NAME?"].index, inplace = True)

# Feature engineering

In [82]:
import textstat
sid=SentimentIntensityAnalyzer()

def doc_length(corpus):
    return np.array([len(doc)-doc.count(" ") for doc in corpus]).reshape(-1, 1)

def lexicon_count(corpus):
    return np.array([textstat.lexicon_count(doc) for doc in corpus]).reshape(-1, 1)

def flesch_reading_ease(corpus):
    return np.array([textstat.flesch_reading_ease(doc) for doc in corpus]).reshape(-1, 1)

def automated_readability(corpus):
    return np.array([textstat.automated_readability_index(doc) for doc in corpus]).reshape(-1, 1)

#found % of words that are capitalized thinking maybe positive reviews have more often capitalization
def get_caps(doc):
    return sum([1 for char in doc if char.isupper()])
    
def capitalized_count(corpus):
    return np.array([get_caps(doc) for doc in corpus]).reshape(-1, 1)

def get_digit(doc):
    return sum([1 for char in doc if char.isdigit()])

def digit_count(corpus):
    return np.array([get_digit(doc) for doc in corpus]).reshape(-1, 1)

def get_punctuation(doc):
    return sum([1 for char in doc if char in string.punctuation])

def punctuation_count(corpus):
    return np.array([get_punctuation(doc) for doc in corpus]).reshape(-1, 1)

def num_exclamation_marks(corpus):
    return np.array([doc.count('!') for doc in corpus]).reshape(-1, 1)

def extract_smile(corpus):
    return np.array([bool(re.search(":\)|:\(|;\)", doc.lower())) for doc in corpus]).reshape(-1, 1)

def string_search(corpus):
    return np.array([bool(re.search("recommend|love|is the best|amazing", doc.lower())) for doc in corpus]).reshape(-1, 1)

def sentiment(corpus):
    return np.array([sid.polarity_scores(doc)["compound"] for doc in corpus]).reshape(-1, 1)

In [83]:
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
import re
import unidecode
import textstat
import string  

lemmer = WordNetLemmatizer()
stop_words = set(stopwords.words('english') + stopwords.words('spanish'))

def clean_text(doc):
    
    doc=BeautifulSoup(doc, ["lxml"]).get_text()
    
    # Lowercase and remove punctuation
    doc = "".join([char.lower() for char in doc if char not in string.punctuation])
    
    # Replace URL with URL string
    doc=re.sub(r'http\S+|www\S+', '', doc).strip()
    
    # Replace AT with AT string
    doc = re.sub(r'@', 'AT', doc)
    
    # Replace all numbers/digits 
    doc= re.sub(r'\d+', '', doc)
    
    #tokenize
    doc=re.split('\W+', doc.strip()) 
    
    #unidecode
    doc=[unidecode.unidecode(tokens) for tokens in doc]
        
    # Lemmatize each word.
    doc = ' '.join([lemmer.lemmatize(w) for w in doc ]) 
        
    return doc

In [86]:
from sklearn.model_selection import train_test_split

X = df['Sentence']
y = df['Polarity']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [87]:
from sklearn.pipeline import Pipeline, FeatureUnion
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.feature_extraction import stop_words
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier, GradientBoostingClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.preprocessing import FunctionTransformer
from sklearn.preprocessing import Normalizer
import time

# This vectorizer will be used to create the BOW features
vectorizer = TfidfVectorizer(preprocessor=clean_text, 
                             max_features = 1000, 
                             ngram_range=[1,4],
                             stop_words=None,
                             strip_accents="unicode", 
                             lowercase=False, max_df=0.8, min_df=0.02,use_idf=True)

lr = LogisticRegression(random_state=42) 
dt = DecisionTreeClassifier(random_state=42)
gb=xgboost.XGBClassifier(random_state=42)
rf = RandomForestClassifier(random_state=42, n_estimators=200)
ada = AdaBoostClassifier(random_state=42, n_estimators=200)
svm=SVC(random_state=42,probability=True)


feature_processing =  FeatureUnion([ 
    ('bow', Pipeline([('cv', vectorizer),])),
    ('length', FunctionTransformer(doc_length, validate=False)),
    ('words', FunctionTransformer(lexicon_count, validate=False)),
    ('flesch_reading_ease', FunctionTransformer(flesch_reading_ease, validate=False)),
    ('automated_readability', FunctionTransformer(automated_readability, validate=False)),    
    ('punctuation_count', FunctionTransformer(punctuation_count, validate=False)),
    ('capital_count', FunctionTransformer(capitalized_count, validate=False)),
    ('digit_count', FunctionTransformer(digit_count, validate=False)),
    ('has_smile', FunctionTransformer(extract_smile, validate=False)),
    ('num_exclamation_marks', FunctionTransformer(num_exclamation_marks, validate=False)),  
    ('has_string', FunctionTransformer(string_search, validate=False)),  
    ('sentiment', FunctionTransformer(sentiment, validate=False)),
])

classifiers = {
    "LR": lr, 
    "DT": dt,
    "RF": rf,
    "Adaboost": ada,
    "SVM":svm,
    "GBC": gb
}

model_results = list()
    
for model_name,model in classifiers.items():
    start = time.time()
    pipe = Pipeline([('features', feature_processing), ('classifier', model)])
    fitting=pipe.fit(X_train,y_train)
    end = time.time()
    total = end - start
    y_pred=fitting.predict(X_test)
    f1=f1_score(y_test, y_pred, average='macro')
    accuracy=accuracy_score(y_test, y_pred)
     
    df_results = pd.DataFrame({"Method": [model_name],
                               "Time" : [total],
                               "F1" : [f1],
                               "Accuracy":[accuracy]
                             })
    model_results.append(df_results)
    
    dataset_results = pd.concat([m for m in model_results], axis = 0).reset_index() 
    dataset_results = dataset_results.drop(columns = "index",axis =1)
    dataset_results = dataset_results.sort_values(by=['F1'], ascending=False)
    dataset_results['Rank'] = range(1, len(dataset_results)+1)
print(dataset_results)

     Method      Time        F1  Accuracy  Rank
2        RF  2.858246  0.841391  0.841667     1
0        LR  1.788133  0.832916  0.833333     2
5       GBC  2.211996  0.822355  0.822917     3
3  Adaboost  2.514186  0.803949  0.804167     4
1        DT  1.595626  0.797452  0.797917     5
4       SVM  3.205747  0.476849  0.556250     6


In [88]:
from sklearn.metrics import confusion_matrix
from sklearn.metrics import classification_report, f1_score

y_pred = pipe.predict(X_test)

print("Confusion matrix:")
print(confusion_matrix(y_test, y_pred))

print("\nF1 Score = {:.5f}".format(f1_score(y_test, y_pred, average="micro")))

print("\nClassification Report:")
print(classification_report(y_test, y_pred))

Confusion matrix:
[[211  35]
 [ 50 184]]

F1 Score = 0.82292

Classification Report:
              precision    recall  f1-score   support

           0       0.81      0.86      0.83       246
           1       0.84      0.79      0.81       234

    accuracy                           0.82       480
   macro avg       0.82      0.82      0.82       480
weighted avg       0.82      0.82      0.82       480



# Adding GridSearch

In [89]:
vectorizer = TfidfVectorizer(preprocessor=clean_text, 
                             max_features = 1000, 
                             ngram_range=[1,4],
                             stop_words=None,
                             strip_accents="unicode", 
                             lowercase=False, max_df=0.8, min_df=0.02,use_idf=True)

lr = LogisticRegression(random_state=42) 
dt = DecisionTreeClassifier(random_state=42)
gb=xgboost.XGBClassifier(random_state=42)
rf = RandomForestClassifier(random_state=42)
ada = AdaBoostClassifier(random_state=42, n_estimators=200)
svm=SVC(random_state=42,probability=True)


feature_processing =  FeatureUnion([ 
    ('bow', Pipeline([('cv', vectorizer),])),
    ('length', FunctionTransformer(doc_length, validate=False)),
    ('words', FunctionTransformer(lexicon_count, validate=False)),
    ('flesch_reading_ease', FunctionTransformer(flesch_reading_ease, validate=False)),
    ('automated_readability', FunctionTransformer(automated_readability, validate=False)),    
    ('punctuation_count', FunctionTransformer(punctuation_count, validate=False)),
    ('capital_count', FunctionTransformer(capitalized_count, validate=False)),
    ('digit_count', FunctionTransformer(digit_count, validate=False)),
    ('has_smile', FunctionTransformer(extract_smile, validate=False)),
    ('num_exclamation_marks', FunctionTransformer(num_exclamation_marks, validate=False)),  
    ('has_string', FunctionTransformer(string_search, validate=False)),  
    ('sentiment', FunctionTransformer(sentiment, validate=False)),
])

classifiers = {
     "LR": lr, 
     "DT": dt,
     "RF": rf,
     "Adaboost": ada,
     "GBC": gb
}

model_results = list()
    
for model_name,model in classifiers.items():
    start = time.time()    
    pipeline = Pipeline(steps=[('preprocess',feature_processing),
                               ('classifier',model)])
    
    if model==rf:
        param_grid = { 
        'classifier__max_features': [25,50, 200, 500, 1000],
        'classifier__max_depth':[1,2,5,8,10,12,15,20],
        'classifier__n_estimators': [100, 200,300],
        'classifier__criterion':["gini","entropy"]
    }
        
    elif model==lr:
        param_grid = { 
           'classifier__C':np.logspace(-5,1,2,8,10),
           'classifier__max_iter':[50,100,200,300,400],
           'classifier__penalty':["l1","l2","elasticnet","none"]
    }
    elif model==dt:
        param_grid = { 
        'classifier__max_features': [50, 200, 500],
        'classifier__max_depth':[1,2,5,10,15,20]
    }
    elif model==ada:
        param_grid = { 
        'classifier__learning_rate': [0.01,0.1,0.5,1],
        'classifier__n_estimators':[50,100,150,200]
    }
    elif model==gb:
        param_grid_gb = { 
        'classifier__learning_rate': [0.05, 0.10, 0.15, 0.20, 0.25, 0.30] ,
        'classifier__max_depth': [ 3, 4, 5, 6, 8, 10, 12, 15],
        'classifier__n_estimators':[50,100,150,200]
    }
    
    pipeline = Pipeline(steps=[('preprocess',feature_processing),
                               ('classifier',model)])

    
    CV = GridSearchCV(pipeline, param_grid, n_jobs= 1)
                  
    fitted=CV.fit(X_train, y_train)  
    print(CV.best_params_)    
    print(CV.best_score_)
    
    end = time.time()
    total = end - start
    
    y_pred=fitted.predict(X_test)
    f1=f1_score(y_test, y_pred, average='macro')
    accuracy=accuracy_score(y_test, y_pred)
    
    df_results = pd.DataFrame({"Method": [model_name],
                               "Time" : [total],
                               "F1" : [f1],
                               "Accuracy":[accuracy]
                             })
    model_results.append(df_results)

    dataset_results = pd.concat([m for m in model_results], axis = 0).reset_index() 
    dataset_results = dataset_results.drop(columns = "index",axis =1)
    dataset_results = dataset_results.sort_values(by=['F1'], ascending=False)
    dataset_results['Rank'] = range(1, len(dataset_results)+1)
print(dataset_results)


{'classifier__C': 1e-05, 'classifier__max_iter': 200, 'classifier__penalty': 'none'}
0.8376795039164492
{'classifier__max_depth': 5, 'classifier__max_features': 50}
0.8100182223672758
{'classifier__criterion': 'gini', 'classifier__max_depth': 12, 'classifier__max_features': 25, 'classifier__n_estimators': 100}
0.8340187119234116
{'classifier__learning_rate': 0.1, 'classifier__n_estimators': 200}
0.8293312119234116
{'classifier__learning_rate': 0.1, 'classifier__n_estimators': 150}
0.8293230526544821
     Method         Time        F1  Accuracy  Rank
3  Adaboost   171.849932  0.832843  0.833333     1
0        LR   307.198583  0.830926  0.831250     2
2        RF  2469.851460  0.826534  0.827083     3
4       GBC   157.729462  0.826450  0.827083     4
1        DT   129.945246  0.813703  0.814583     5


In [90]:
CV.score(X_test, y_test)

0.8270833333333333

In [91]:
fitted.best_params_

{'classifier__learning_rate': 0.1, 'classifier__n_estimators': 150}

In [31]:
# fitted=CV.fit(X_train, y_train)

In [92]:
pipeline = Pipeline(steps=[('preprocess',feature_processing),
                               ('classifier',AdaBoostClassifier(learning_rate= 0.1, n_estimators=150,random_state=42))])
fit=pipeline.fit(X_train,y_train)

In [93]:
# Get the feature processing pipeline, so I can use it later
feature_processing_obj = pipeline.named_steps['preprocess']

In [101]:
df_test = pd.read_csv("datasets/sentiment_test.csv")

In [102]:
features_test = feature_processing_obj.transform(df_test['Sentence']).todense()
pred_test = pipeline.predict(df_test['Sentence'])