# Import packages and load dataset

In [3]:
import pandas as pd
import numpy as np

from sklearn.model_selection import train_test_split
from sklearn.svm import LinearSVC
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

from sklearn.feature_extraction.text import TfidfVectorizer

'''My Preprocessing package'''
import preprocess_hungcuongthan as pp

In [5]:
df = pd.read_csv('https://raw.githubusercontent.com/laxmimerit/twitter-data/master/twitt30k.csv')
df.head()

Unnamed: 0,twitts,sentiment
0,@robbiebronniman Sounds like a great night.,1
1,Damn the person who stolde my wallet !!!!! Ma...,1
2,Greetings from the piano bench (photo) http:/...,1
3,@drewryanscott i love it!! i love you!! haha f...,1
4,"@kissthestars Pretty pretty pretty please, pak...",0


# Text Preprocessing

### Lowercase the twitts

In [7]:
df['twitts'] = df['twitts'].apply(lambda x: x.lower())

In [8]:
df['twitts'].head(2)

0         @robbiebronniman sounds like a great night. 
1    damn the person who stolde my wallet !!!!!  ma...
Name: twitts, dtype: object

### Contraction to expansion (example: I'm --> I am)

In [9]:
df['twitts'] = df['twitts'].apply(lambda x: pp.get_contraction_expansion(x))

### Remove unrelated information from twitts (emails, urls, etc.)

In [10]:
# remove emails
df['twitts'] = df['twitts'].apply(lambda x: pp.remove_emails(x))

In [11]:
#remove urls
df['twitts'] = df['twitts'].apply(lambda x: pp.remove_urls(x))

In [12]:
#remove rt (retwitts)
df['twitts'] = df['twitts'].apply(lambda x: pp.remove_rt(x))

In [13]:
#remove html tags
df['twitts'] = df['twitts'].apply(lambda x: pp.remove_html_tags(x))

In [14]:
#remove special characters
df['twitts'] = df['twitts'].apply(lambda x: pp.remove_special_chars(x))

# Build and train basic SVC

In [15]:
def run_svm(df):
    X = df['twitts']
    y = df['sentiment']

    tfidf = TfidfVectorizer()
    X = tfidf.fit_transform(X)
    
    X_train, X_test, y_train, y_test = train_test_split(X,y,test_size = 0.2, stratify = y)
    clf = LinearSVC()
    clf.fit(X_train,y_train)
    y_pred = clf.predict(X_test)
    print('Classification_report')
    print(classification_report(y_test,y_pred))
    
    return tfidf,clf

In [16]:
tfidf, clf = run_svm(df)

Classification_report
              precision    recall  f1-score   support

           0       0.75      0.74      0.75      3000
           1       0.75      0.75      0.75      3000

    accuracy                           0.75      6000
   macro avg       0.75      0.75      0.75      6000
weighted avg       0.75      0.75      0.75      6000



In [17]:
# test our own cases

x = ['i am really happy, thanks a lot for coming with me','i hate you']

clf.predict(tfidf.transform(x))

array([1, 0])

# Fine-tuning SVC model

In [23]:
def run_svm(df):
    X = df['twitts']
    y = df['sentiment']

    tfidf = TfidfVectorizer(norm = 'l1' #change norm l2 --> l1
                           ,max_features= 5000 #limit the number of features or tokens
                           )
    X = tfidf.fit_transform(X)
    
    X_train, X_test, y_train, y_test = train_test_split(X,y,test_size = 0.2, stratify = y)
    clf = LinearSVC()
    clf.fit(X_train,y_train)
    y_pred = clf.predict(X_test)
    print('Classification_report')
    print(classification_report(y_test,y_pred))
    
    return tfidf,clf

In [24]:
run_svm(df)

Classification_report
              precision    recall  f1-score   support

           0       0.76      0.76      0.76      3000
           1       0.76      0.77      0.76      3000

    accuracy                           0.76      6000
   macro avg       0.76      0.76      0.76      6000
weighted avg       0.76      0.76      0.76      6000



(TfidfVectorizer(max_features=5000, norm='l1'), LinearSVC())

# Save and load ML model

Save our model so that we can use it later

In [25]:
import pickle

In [27]:
#save the classifier
pickle.dump(clf, open('clf.pkl','wb')) #wb: write in binary form

pickle.dump(tfidf,open('tfidf.pkl','wb'))

In [28]:
#Delete previous models and tfidf
del clf
del tfidf

In [29]:
#Load the model
clf = pickle.load(open('clf.pkl','rb'))
tfidf = pickle.load(open('tfidf.pkl','rb'))

In [30]:
clf

LinearSVC()

In [31]:
tfidf

TfidfVectorizer()

In [34]:
x = ['you are great','you are the goat']
clf.predict(tfidf.transform(x))

array([1, 1])