In [None]:
# Import Libraries

from nltk.corpus import stopwords
import string
from nltk.stem import PorterStemmer
from nltk.stem import WordNetLemmatizer
import nltk
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import classification_report
import random
import re
import pandas as pd
from bs4 import BeautifulSoup

import unicodedata
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.ensemble import RandomForestClassifier

In [None]:
# Load the ISEAR file and discard the unwanted 'Unnamed: 2' column in one chain.
df = pd.read_csv('../input/isear.txt').drop(columns='Unnamed: 2')


In [None]:
# NLTK's English stop words, held in a set for constant-time membership tests.
stop_words = {word for word in stopwords.words('english')}
# Punctuation that is replaced by a single space during cleaning.
# Raw string: `\[`, `\]` and `\|` are regex escapes, not Python string escapes;
# in a plain literal they trigger an "invalid escape sequence" warning.
REPLACE_BY_SPACE_RE = re.compile(r'[/(){}\[\]\|@,;]')
# Anything that is NOT a digit, lowercase ASCII letter, space, '#', '+' or '_'.
# Note the pattern has no uppercase range, so apply it to lowercased text only.
BAD_SYMBOLS_RE = re.compile(r'[^0-9a-z #+_]')

In [None]:
# Clean the data

import re
def clean_text(line):
    """Normalise one raw text entry into a space-joined string of lemmas.

    Steps, in order: strip HTML markup with BeautifulSoup (lxml parser),
    replace every character that is not an ASCII letter with a space,
    lowercase, apply the module-level ``REPLACE_BY_SPACE_RE`` and
    ``BAD_SYMBOLS_RE`` filters, drop English stop words, and lemmatize the
    remaining tokens with NLTK's WordNet lemmatizer.

    Parameters
    ----------
    line : str
        Raw text, possibly containing HTML.

    Returns
    -------
    str
        The cleaned tokens joined by single spaces ('' when none remain).
    """
    line = BeautifulSoup(line, "lxml").text  # HTML decoding
    line = re.sub(pattern='[^a-zA-Z]', repl=' ', string=str(line))
    # Lowercase BEFORE BAD_SYMBOLS_RE: that pattern keeps only lowercase
    # letters, so running it on mixed-case text deleted every capital letter
    # (e.g. "Happy" became "appy").
    line = line.lower()
    line = REPLACE_BY_SPACE_RE.sub(' ', line)  # listed punctuation -> space
    line = BAD_SYMBOLS_RE.sub('', line)  # drop any remaining disallowed symbol
    # Use the module-level stop-word set rather than rebuilding it from the
    # NLTK corpus for every single token.
    token_words = [word for word in line.split() if word not in stop_words]
    lemmatizer = WordNetLemmatizer()
    return ' '.join(lemmatizer.lemmatize(word) for word in token_words)


# Alias used when applying the cleaner to a DataFrame column.
clean = clean_text

In [None]:
# Run the cleaner over every entry of the text column, then preview the frame.
df['text'] = df['text'].apply(clean)
df.head()

In [None]:
# Features are the cleaned text; targets are the label column.
X, y = df['text'], df['label']

In [None]:
# Bag-of-words counts over unigrams and bigrams; terms appearing in fewer than
# 2 documents or in more than half of the documents are ignored.
word_vectorizer = CountVectorizer(ngram_range=(1, 2), min_df=2, max_df=0.5)
# Learn the vocabulary and build the sparse count matrix in a single call.
vectors = word_vectorizer.fit_transform(X)
# Dense copy of the counts; this is what gets split and fed to the model below.
X_vec = vectors.toarray()

In [None]:
# Preview a handful of vocabulary entries (term -> feature index) instead of
# dumping the whole mapping, which can run to many thousands of n-grams.
list(word_vectorizer.vocabulary_.items())[:20]

In [None]:
# vectors.shape is (number of texts, number of features).
print("Each of the %d text is represented by %d features " % (vectors.shape[0], vectors.shape[1]))

In [None]:
# Label Encoding
from sklearn import preprocessing 
# Fit the encoder on the label column, then map every label to its integer code.
label_encoder = preprocessing.LabelEncoder().fit(y)
labels = label_encoder.transform(y)


In [None]:
# Train/test split: hold out 30% of the rows, stratified on the encoded labels
# so both parts keep the same class proportions. random_state pins the shuffle;
# without it the split, and every metric computed from it, changed on each run.
X_train, X_test, y_train, y_test = train_test_split(
    X_vec, y, test_size=0.3, stratify=labels, random_state=1
)

In [None]:
# fit model
from sklearn.ensemble import RandomForestClassifier

# Random forest of 300 trees, each capped at depth 35; the seed makes the
# fitted forest repeatable for a given training set.
rbf = RandomForestClassifier(
    n_estimators=300,
    max_depth=35,
    min_samples_split=2,
    min_samples_leaf=1,
    random_state=1,
)
rbf.fit(X_train, y_train)

In [None]:
# Mean accuracy on the training split -- an optimistic figure, not a test score.
rbf.score(X_train,y_train)

In [None]:
# Predict a label for every row of the held-out test split.
pred = rbf.predict(X_test)

In [None]:
# Accuracy, Precision and Recall
from sklearn.metrics import accuracy_score, precision_score, recall_score
# Accuracy plus support-weighted precision and recall on the test split.
score1 = accuracy_score(y_true=y_test, y_pred=pred)
score2 = precision_score(y_true=y_test, y_pred=pred, average='weighted')
score3 = recall_score(y_true=y_test, y_pred=pred, average='weighted')
# One print call, one line per entry.
print(
    "---- Scores ----",
    "Accuracy score is: {}%".format(round(score1 * 100, 2)),
    "Precision score is: {}".format(round(score2, 2)),
    "Recall score is: {}".format(round(score3, 2)),
    sep="\n",
)

In [None]:
# Per-class precision / recall / F1 table for the test split.
print(classification_report(y_true=y_test, y_pred=pred))

In [None]:
# Making the Confusion Matrix
from sklearn.metrics import confusion_matrix
# Confusion matrix of true vs. predicted labels on the test split.
lr_cm = confusion_matrix(y_true=y_test, y_pred=pred)

In [None]:
import re

def emotion_prediction(sample):
    """Predict the emotion label for a single piece of raw text.

    The text is passed through ``clean_text`` -- the same preprocessing that is
    applied to the training column -- before being vectorized, so its tokens
    line up with the vocabulary ``word_vectorizer`` was fitted on. (The
    previous inline clean-up skipped HTML stripping and lemmatization, and
    rebuilt the stop-word set for every token.)

    Parameters
    ----------
    sample : str
        Raw input sentence.

    Returns
    -------
    array
        Whatever ``rbf.predict`` returns for the one-row feature matrix,
        i.e. an array holding a single predicted label.
    """
    features = word_vectorizer.transform([clean_text(sample)])
    return rbf.predict(features)

In [None]:
# Predicting values
import numpy as np
sample = "I am very much happy by your behavior"

# emotion_prediction returns a one-element array; take the label itself rather
# than trimming brackets and quotes off the array's string representation.
prediction = str(emotion_prediction(sample)[0])
print('The emotion of the "{}" is : {}'.format(sample, prediction.upper()))

In [None]:
# Save Model
import pickle
# Persist the fitted classifier and vectorizer. The context managers flush and
# close each file even if pickling raises; the bare open() calls never closed
# their handles.
with open('rbf_model3.pkl', 'wb') as model_file:
    pickle.dump(rbf, model_file)
with open('cv_transform1.pkl', 'wb') as vectorizer_file:
    pickle.dump(word_vectorizer, vectorizer_file)