# TFIDF + ML Models

### Libraries

In [9]:
## Download and import libraries

import pandas as pd
import numpy as np
import nltk
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import accuracy_score
from sklearn.svm import LinearSVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report
from nltk.corpus import stopwords

In [10]:
## Load the English stopword list; fetch the NLTK corpus automatically if it is missing
from nltk.corpus import stopwords
try:
    stop_words=stopwords.words('english')
except LookupError:
    # NLTK raises LookupError when the 'stopwords' corpus is not installed locally.
    # Download it once and retry so the notebook survives Restart & Run All on a fresh machine.
    nltk.download('stopwords')
    stop_words=stopwords.words('english')

### Dataset

In [11]:
## Load the training and testing spreadsheets
## Adjust the two file names below if the files live in another directory

train_df, test_df = (pd.read_excel(path) for path in ('P1_training.xlsx', 'P1_testing.xlsx'))
train_df

Unnamed: 0,sentence,label
0,living in a concentration camp-like atmosphere...,1
1,"there's even a nod to "" the blues brothers , ""...",1
2,"park , lord , and screenwriter karey kirkpatri...",1
3,"ginger is perfect , spunky and opinionated , b...",2
4,jane horrocks delivers a lovely voice characte...,2
...,...,...
1655,"lin shae , who plays mary's neighbor magda ( a...",2
1656,steve martin took an extended vacation from al...,2
1657,much of the book spares tinseltown from mocker...,2
1658,"now , as writer and star of bowfinger , he off...",1


### Preprocessing the Dataset

In [12]:
## Cleaning the sentences for training and testing data
## Lowercases the text, then removes punctuation and stopwords

def get_tokens(sentence):
    """Split a sentence into tokens on runs of whitespace."""
    return sentence.split()

def text_preprocessing(df, stopword_list=None):
    """Return a cleaned copy of ``df`` with a normalized ``sentence`` column.

    The ``sentence`` column is lowercased, stripped of every character that is
    neither a word character nor whitespace, and then filtered of stopwords.
    Punctuation is removed *before* stopword filtering, so a token such as
    "it's" becomes "its" and is then matched against the stopword list.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with a string column named ``sentence``. It is not modified.
    stopword_list : iterable of str, optional
        Words to drop. Defaults to the notebook-level ``stop_words`` list.

    Returns
    -------
    pandas.DataFrame
        A copy of ``df`` whose ``sentence`` column holds the cleaned text.
    """
    if stopword_list is None:
        stopword_list = stop_words
    # Set membership is O(1) per token; the list lookup was O(len(stopwords)).
    stopword_set = set(stopword_list)

    # Work on a copy so the caller's frame (and any earlier display of it) stays intact.
    cleaned = df.copy()
    sentences = cleaned['sentence'].str.lower()
    # regex=True must be explicit: pandas >= 2.0 treats the pattern as a literal
    # string by default, which would silently leave all punctuation in place.
    sentences = sentences.str.replace(r'[^\w\s]', '', regex=True)
    cleaned['sentence'] = sentences.apply(
        lambda text: ' '.join(word for word in get_tokens(text) if word not in stopword_set)
    )
    return cleaned

# Run the same cleaning over both splits, then preview the cleaned training frame
train_df, test_df = text_preprocessing(train_df), text_preprocessing(test_df)
train_df

Unnamed: 0,sentence,label
0,living concentration camplike atmosphere led o...,1
1,theres even nod blues brothers believe filmmak...,1
2,park lord screenwriter karey kirkpatrick reali...,1
3,ginger perfect spunky opinionated soft heart f...,2
4,jane horrocks delivers lovely voice characteri...,2
...,...,...
1655,lin shae plays marys neighbor magda also appea...,2
1656,steve martin took extended vacation facets mov...,2
1657,much book spares tinseltown mockery although r...,2
1658,writer star bowfinger offers masses plenty goo...,1


### Vectorize the data

In [13]:
## TFIDF Vectors

# Labels are taken straight from the frames; only the text column is vectorized
Y_train, Y_test = train_df['label'], test_df['label']

## Vectorize the data
# The vectorizer is fitted on the training sentences only and then reused,
# unchanged, to transform the test sentences
tfidf = TfidfVectorizer()
X_train = tfidf.fit_transform(train_df['sentence'])
X_test = tfidf.transform(test_df['sentence'])


### ML Classifiers

In [24]:
# Random Forest Classifier
# random_state pins the forest's internal randomness so re-running the cell
# reproduces the same metrics instead of a slightly different score each time.
rfc=RandomForestClassifier(n_estimators=15, random_state=42)
rfc.fit(X_train,Y_train)
y_pred=rfc.predict(X_test)
# scikit-learn metrics expect (y_true, y_pred) in that order.
print('Accuracy %s \n' % accuracy_score(Y_test, y_pred))
print ("Classification Report \n",classification_report(Y_test,y_pred))

Accuracy 0.5563689604685212 

Classification Report 
               precision    recall  f1-score   support

           0       0.17      0.02      0.04        82
           1       0.55      0.63      0.59       303
           2       0.58      0.63      0.60       298

    accuracy                           0.56       683
   macro avg       0.43      0.43      0.41       683
weighted avg       0.52      0.56      0.53       683



In [15]:
# Support Vector Classifier
# random_state fixes the solver's data shuffling so the reported scores are repeatable.
svc=LinearSVC(random_state=42)
svc.fit(X_train,Y_train)
y_pred=svc.predict(X_test)
# scikit-learn metrics expect (y_true, y_pred) in that order.
print('Accuracy %s \n' % accuracy_score(Y_test, y_pred))
print ("Classification Report \n",classification_report(Y_test,y_pred))

Accuracy 0.5724743777452416 

Classification Report 
               precision    recall  f1-score   support

           0       0.13      0.04      0.06        82
           1       0.58      0.68      0.62       303
           2       0.60      0.61      0.61       298

    accuracy                           0.57       683
   macro avg       0.44      0.44      0.43       683
weighted avg       0.53      0.57      0.55       683



In [8]:
## Here SVC performs better as compared to Random Forest and we consider SVC 
## Please provide the path where testing file exist
# The test file is re-read so the exported sentences are the raw, unprocessed text.
temp_df=pd.read_excel('P1_testing.xlsx')
# Predict with svc explicitly: the shared y_pred variable holds the output of
# whichever classifier cell ran last, so it may contain Random Forest predictions.
temp_df['predicted_label']=svc.predict(X_test)
temp_df=temp_df.rename(columns={"label": "golden_label"})
temp_df.to_csv("testing_output_TFIDF_SVC.csv")