# TRAINING MODEL FOR SENTIMENT ANALYSIS

In [None]:
import re
import pandas as pd 
import numpy as np 
import matplotlib.pyplot as plt 
import seaborn as sns
import string
import nltk
from nltk.corpus import stopwords
import warnings
from sklearn.feature_extraction.text import TfidfVectorizer
warnings.filterwarnings("ignore", category=DeprecationWarning)
nltk.download('stopwords')

%matplotlib inline
# %install_ext https://raw.github.com/cpcloud/ipython-autotime/master/autotime.py
# %load_ext autotime
nltk.download('stopwords')

## LOAD DATA

In [None]:
# Load the raw tweet CSV. Only column 0 (label) and column 5 (tweet text) are
# needed, so `usecols` lets the parser skip the other columns instead of
# loading them and throwing them away afterwards.
training_data = pd.read_csv(
    'sentiment data/training.1600000.processed.noemoticon.csv',
    encoding='latin-1',
    header=None,
    usecols=[0, 5],
)
training_data.columns = ['label', 'content']
# Shuffle all rows. The seed is fixed so that the positional train/test split
# done later selects the same tweets on every run.
training_data = training_data.sample(frac=1, random_state=42).reset_index(drop=True)

## PROCESSED DATA

In [None]:
# `combi` is a second name for the same DataFrame object (no copy is made),
# so the rename and the label edit below are also visible via `training_data`.
combi = training_data
combi.columns = ['label', 'tweet']
# Recode label 4 as 1; rows carrying any other label are left untouched.
is_label_four = combi['label'].eq(4)
combi.loc[is_label_four, 'label'] = 1

### REMOVING TWITTER MENTIONS (@user)

In [None]:
def remove_pattern(input_txt, pattern):
    """Return ``input_txt`` with every match of the regex ``pattern`` removed.

    The pattern is substituted directly in a single pass. Feeding each matched
    string back into ``re.sub`` as if it were a pattern goes wrong when a match
    contains regex metacharacters (``+``, ``.``, ``(`` ...), and lets a short
    match such as a bare ``@`` strip the ``@`` off longer handles so that they
    are no longer removed.

    Args:
        input_txt: Text to clean.
        pattern: Regular expression (string or compiled) describing what to drop.

    Returns:
        The cleaned string.
    """
    return re.sub(pattern, '', input_txt)

# remove twitter handles (@user)
# The pattern is a raw string: in a plain literal "\w" is an invalid escape
# sequence, which newer Python versions warn about.
combi['tidy_tweet'] = np.vectorize(remove_pattern)(combi['tweet'], r"@[\w]*")

### REMOVING URLS

In [None]:
def remove_urls(df):
    """Strip URLs from the ``tidy_tweet`` column of ``df`` in place.

    Every run of non-whitespace characters that starts with ``http`` is
    deleted. ``regex=True`` is passed explicitly because pandas 2.0 changed the
    default of ``Series.str.replace`` to literal matching; under that default
    the pattern never matches and URLs silently stay in the text.

    Args:
        df: DataFrame with a string column named ``tidy_tweet``; it is
            modified in place.

    Returns:
        None.
    """
    df['tidy_tweet'] = df['tidy_tweet'].str.replace(r"http\S+", "", regex=True)

# Strip URLs from the working frame; `remove_urls` rewrites the 'tidy_tweet' column in place.
remove_urls(combi)

### REMOVING PUNCTUATION, NUMBERS AND SPECIAL CHARACTERS

In [None]:
# remove special characters, numbers, punctuations:
# every character that is not an ASCII letter or '#' is replaced by a space.
# regex=True is explicit because pandas >= 2.0 treats the pattern as a literal
# string by default, which would leave the text unchanged.
combi['tidy_tweet'] = combi['tidy_tweet'].str.replace("[^a-zA-Z#]", " ", regex=True)

### REMOVING SHORT WORDS AND CONVERTING TO LOWER CASE

In [None]:
# Drop short tokens: only whitespace-separated words of four or more
# characters survive, re-joined with single spaces.
combi['tidy_tweet'] = combi['tidy_tweet'].map(
    lambda tweet: ' '.join(token for token in tweet.split() if len(token) >= 4)
)

# Convert what is left to lower case.
combi['tidy_tweet'] = combi['tidy_tweet'].map(str.lower)

### REMOVING STOPWORDS

In [None]:
# A frozenset gives constant-time membership tests; a plain list would be
# scanned from the start for every word of every tweet.
STOPWORDS = frozenset(stopwords.words('english'))


def cleaning_stopwords(text):
    """Return ``text`` with the words found in ``STOPWORDS`` removed.

    ``text`` is coerced to ``str`` and split on whitespace; the surviving words
    are re-joined with single spaces. Matching is exact (case-sensitive).
    """
    return " ".join(word for word in str(text).split() if word not in STOPWORDS)


combi['tidy_tweet'] = combi['tidy_tweet'].apply(cleaning_stopwords)
combi['tidy_tweet'].head()

### TOKENIZATION

In [None]:
# Turn each cleaned tweet into a list of its whitespace-separated tokens.
tokenized_tweet = combi['tidy_tweet'].map(str.split)
tokenized_tweet.head()

### STEMMING

In [None]:
# Import the one class that is used; a wildcard import would also drop every
# other public name of the module into the notebook namespace.
from nltk.stem.porter import PorterStemmer

stemmer = PorterStemmer()

# Replace every token by its Porter stem, tweet by tweet.
tokenized_tweet = tokenized_tweet.apply(lambda tokens: [stemmer.stem(token) for token in tokens])
tokenized_tweet.head()

In [None]:
# Re-join each token list into one space-separated string in a single pass.
# Assigning element by element through `tokenized_tweet[i]` is far slower on a
# large Series and only works when the index labels are exactly 0..n-1.
tokenized_tweet = tokenized_tweet.apply(' '.join)

combi['tidy_tweet'] = tokenized_tweet

## TRAIN MODEL

### SPLITTING TRAINING AND TESTING DATA

In [None]:
# Alias, not a copy: `selected_df` refers to the same DataFrame object as `combi`.
selected_df = combi

In [None]:
# `.loc` slices by index label and includes the end label, so both Series
# cover every row up to and including label 15000.
X, y = (selected_df.loc[:15000, column] for column in ('tidy_tweet', 'label'))

In [None]:
# Rows before this position train the model; the remaining rows are held out.
n_train = 10000

# Unigram + bigram TF-IDF features, limited to at most 10000 terms.
vectoriser = TfidfVectorizer(ngram_range=(1,2), max_features=10000)
# Learn the vocabulary and IDF weights from the training tweets ONLY. Fitting
# on every row would let statistics of the held-out tweets leak into the
# features and make the test score optimistic.
vectoriser.fit(X.iloc[:n_train])
# Encode all selected tweets with the training-derived vocabulary.
X = vectoriser.transform(X)
X = X.toarray()

X_train = X[:n_train, :]
X_test = X[n_train:, :]

y_train = y.iloc[:n_train].to_numpy()
y_test = y.iloc[n_train:].to_numpy()

### SVM

In [None]:
from sklearn.svm import SVC
from sklearn.model_selection import RandomizedSearchCV

# Candidate hyper-parameter values for an RBF-kernel SVM.
param_grid = {
    'C': [0.1, 1, 10, 100],
    'gamma': [1, 0.1, 0.01, 0.001],
    'kernel': ['rbf'],
}

# Randomized (not exhaustive) search over the candidates with 3-fold
# cross-validation; n_iter is left at its default. refit=True retrains the
# best configuration once the search is done.
grid = RandomizedSearchCV(
    estimator=SVC(),
    param_distributions=param_grid,
    refit=True,
    verbose=3,
    cv=3,
)

# run the randomized search on the training split
grid.fit(X_train, y_train)

In [None]:
# Score the refitted best SVM on the held-out split.
best_svm = grid.best_estimator_
best_svm.score(X_test, y_test)

In [None]:
from joblib import dump, load

# Persist the fitted TF-IDF vectoriser alongside the model: the SVM can only
# score new text that has been mapped through the exact vocabulary and IDF
# weights it was trained with.
dump(vectoriser, 'sentiment-tfidf-vectoriser.joblib')
# Persist the tuned SVM itself.
dump(grid.best_estimator_, 'sentiment-SVM-model.joblib')

In [None]:
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib tk
import seaborn as sns

# Hard-coded (C, gamma, accuracy) triples, one per hyper-parameter trial.
# NOTE(review): presumably copied from the randomized-search output above — confirm.
_trials = [
    (1, 0.01, 0.512),
    (10, 0.1, 0.695),
    (100, 1, 0.693),
    (100, 0.001, 0.701),
    (10, 1, 0.693),
    (1, 1, 0.704),
    (10, 0.01, 0.700),
    (0.1, 0.01, 0.504),
    (100, 0.01, 0.694),
    (1, 0.1, 0.695),
]
# Unzip the triples into three parallel lists.
C, gamma, accuracy = (list(column) for column in zip(*_trials))

In [None]:
# One row per trial.
data = pd.DataFrame({'C': C, 'gamma': gamma, 'accuracy': accuracy})
# Reshape to a C-by-gamma table of accuracies; (C, gamma) pairs without a trial
# stay NaN. The arguments are passed by keyword because pandas 2.0 made
# DataFrame.pivot keyword-only, so the positional form raises TypeError.
data_pivoted = data.pivot(index="C", columns="gamma", values="accuracy")
sns.set(font_scale = 1.5)
# Annotated heatmap of accuracy per (C, gamma) pair.
ax = sns.heatmap(data_pivoted, annot=True, fmt='.3g', cmap="YlGnBu")
plt.show()