# Import packages and load data

In [4]:
import pandas as pd
import numpy as np

from sklearn.model_selection import train_test_split
from sklearn.svm import LinearSVC
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

from sklearn.feature_extraction.text import TfidfVectorizer

In [2]:
# Remote CSV holding the tweets; the 'twitts' and 'sentiment' columns are used below.
DATA_URL = 'https://raw.githubusercontent.com/laxmimerit/twitter-data/master/twitt30k.csv'

df = pd.read_csv(DATA_URL)
df.head()

Unnamed: 0,twitts,sentiment
0,@robbiebronniman Sounds like a great night.,1
1,Damn the person who stolde my wallet !!!!! Ma...,1
2,Greetings from the piano bench (photo) http:/...,1
3,@drewryanscott i love it!! i love you!! haha f...,1
4,"@kissthestars Pretty pretty pretty please, pak...",0


The `sentiment` column is the label: 1 = positive sentiment, 0 = negative sentiment.

In [5]:
# Show the (rows, columns) dimensions of the loaded frame.
df.shape

(30000, 2)

In [6]:
# Count rows per sentiment label to check the class balance.
df['sentiment'].value_counts()

1    15000
0    15000
Name: sentiment, dtype: int64

The dataset is balanced: both classes contain 15,000 tweets.

# SVM model and Data Preparation

### Data preparation

In [7]:
X = df['twitts']
y = df['sentiment']

# Convert the raw tweet text into a sparse TF-IDF document-term matrix.
# NOTE(review): the vectorizer is fitted on the full corpus before the
# train/test split, so test-set vocabulary and IDF weights influence the
# training features. Fit on the training rows only for an unbiased estimate.
tfidf = TfidfVectorizer()
X = tfidf.fit_transform(X)

# Keep X sparse. A dense copy via X.toarray() would hold one float64 per
# (document, term) cell -- several gigabytes for this corpus -- and
# LinearSVC accepts the sparse matrix directly.
X

<30000x40854 sparse matrix of type '<class 'numpy.float64'>'
	with 354357 stored elements in Compressed Sparse Row format>

In [8]:
# Hold out 20% of the rows for testing; stratify keeps the class ratio equal
# in both splits. A fixed random_state makes the split, and therefore the
# reported scores, reproducible across kernel restarts.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, stratify=y, random_state=42
)

In [9]:
# Sanity-check the dimensions of the training and test feature matrices.
X_train.shape,X_test.shape

((24000, 40854), (6000, 40854))

### Build and train SVC model

In [10]:
# Linear support vector classifier. random_state fixes the solver's internal
# data shuffling so repeated runs produce the same model.
clf = LinearSVC(random_state=42)

In [11]:
# Fit the classifier on the training split.
clf.fit(X_train,y_train)

LinearSVC()

In [12]:
# Predict sentiment labels for the held-out test split.
y_pred = clf.predict(X_test)

### Test the accuracy

In [13]:
# Print a heading followed by per-class precision/recall/F1 on the test split.
print(
    'Classification_report',
    classification_report(y_test, y_pred),
    sep='\n',
)

Classification_report
              precision    recall  f1-score   support

           0       0.77      0.72      0.74      3000
           1       0.74      0.78      0.76      3000

    accuracy                           0.75      6000
   macro avg       0.75      0.75      0.75      6000
weighted avg       0.75      0.75      0.75      6000



We got an accuracy score of 0.75, which is acceptable for a simple baseline: it is well above the 0.50 chance level of this balanced two-class dataset.

### Wrap the pipeline in a reusable function

In [14]:
def run_svm(df, text_col='twitts', label_col='sentiment', test_size=0.2, random_state=42):
    """Train and evaluate a TF-IDF + LinearSVC text classifier.

    The rows are split into stratified train/test sets first; the TF-IDF
    vectorizer is then fitted on the training texts only, so no vocabulary
    or IDF statistics from the test set leak into the features. A
    classification report for the test set is printed as a side effect.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame containing one text column and one label column.
    text_col : str, default 'twitts'
        Name of the column holding the raw text.
    label_col : str, default 'sentiment'
        Name of the column holding the class labels.
    test_size : float or int, default 0.2
        Forwarded to ``train_test_split``.
    random_state : int or None, default 42
        Seed used for both the split and the classifier so repeated calls
        give the same result. Pass ``None`` for a non-deterministic run.

    Returns
    -------
    tuple
        ``(tfidf, clf)``: the fitted ``TfidfVectorizer`` and the fitted
        ``LinearSVC``. Use ``clf.predict(tfidf.transform(texts))`` to
        classify new texts.

    Raises
    ------
    KeyError
        If ``text_col`` or ``label_col`` is not a column of ``df``.
    """
    missing = [col for col in (text_col, label_col) if col not in df.columns]
    if missing:
        raise KeyError(f'DataFrame is missing required column(s): {missing}')

    texts = df[text_col]
    labels = df[label_col]

    # Split the raw text before vectorizing so the test rows stay unseen.
    text_train, text_test, y_train, y_test = train_test_split(
        texts, labels, test_size=test_size, stratify=labels, random_state=random_state
    )

    # Learn vocabulary and IDF weights from the training texts only, then
    # reuse them to encode the test texts.
    tfidf = TfidfVectorizer()
    X_train = tfidf.fit_transform(text_train)
    X_test = tfidf.transform(text_test)

    clf = LinearSVC(random_state=random_state)
    clf.fit(X_train, y_train)
    y_pred = clf.predict(X_test)

    print('Classification_report')
    print(classification_report(y_test, y_pred))

    return tfidf, clf

In [15]:
%%time

# Time one end-to-end run; rebinds tfidf and clf to the objects returned by run_svm.
tfidf,clf = run_svm(df)

Classification_report
              precision    recall  f1-score   support

           0       0.76      0.74      0.75      3000
           1       0.74      0.76      0.75      3000

    accuracy                           0.75      6000
   macro avg       0.75      0.75      0.75      6000
weighted avg       0.75      0.75      0.75      6000

CPU times: user 658 ms, sys: 26 ms, total: 684 ms
Wall time: 714 ms


In [18]:
# Two hand-written example sentences to classify.
x = [
    'i am really happy, thanks a lot for coming with me',
    'i hate you',
]

# Encode the sentences with the fitted vectorizer, then predict their labels.
clf.predict(tfidf.transform(x))

array([1, 0])

The model's predictions match what we would expect:
- The first sentence is predicted as positive sentiment (1).
- The second sentence is predicted as negative sentiment (0).