In [None]:
# Colab-only: mount Google Drive at /content/drive, the location the later
# read_csv / json.load cells read their input files from.
from google.colab import drive 
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
import pandas as pd
import numpy as np

import json 

from sklearn import preprocessing
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import SGDClassifier
from sklearn.svm import SVC
from sklearn.svm import LinearSVC
from sklearn.naive_bayes import GaussianNB
from sklearn.naive_bayes import BernoulliNB
from sklearn.naive_bayes import MultinomialNB
from sklearn.ensemble import AdaBoostClassifier
from lightgbm import LGBMClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn import metrics
from sklearn.metrics import classification_report
from sklearn.metrics import accuracy_score

import plotly.express as px
import plotly.graph_objects as go
import plotly.figure_factory as ff

import matplotlib.pyplot as plt
import pydot
from sklearn import tree
from sklearn.tree import export_graphviz
import shutil

In [None]:
# Load the labelled texts and reduce the task to binary sentiment.
df = pd.read_csv('/content/drive/MyDrive/sem8/nlp/mini p /dataset.csv')
# 'Unnamed: 0' is not used anywhere downstream (presumably a saved index column).
df = df.drop(columns=['Unnamed: 0'])
# Keep every row that is not labelled 'neutral'; .copy() detaches the result so
# the column assignment below does not write into a view of the original frame.
df = df[df['sentiment'] != 'neutral'].copy()
# Encode labels on the sentiment column only. A frame-wide replace would also
# rewrite any cell in `texts` that happens to equal 'positive' or 'negative'.
# astype raises if a label other than these two is present, instead of letting
# a stray string reach the classifiers.
df['sentiment'] = df['sentiment'].map({'positive': 1, 'negative': 0}).astype('int64')
df.head()

Unnamed: 0,texts,sentiment
0,लोग वतन तक खा जाते हैं इसका इसे यकीन नहींमान ज...,0
1,गुमनाम है वतन पर मिटने वाले लोग आतन्कवादियों स...,0
2,ज़ंजीर बदली जा रही थी मैं समझा था रिहाई हो गयी है,0
3,यूपी में बड़े स्तर पर दंगे करवा सकती है बीजेपी...,0
4,अंग्रेजी नहीं आती है इसलिए हिन्दी ट्विट ज्यादा...,0


In [None]:
# (rows, columns) of the frame after the neutral rows were removed.
df.shape

(6428, 2)

In [None]:
# Non-null cell count per column among rows labelled positive (1).
df[df['sentiment'] == 1].count()

texts        3254
sentiment    3254
dtype: int64

In [None]:
# Non-null cell count per column among rows labelled negative (0).
df[df['sentiment'] == 0].count()

texts        3174
sentiment    3174
dtype: int64

## Preprocessing

In [None]:
# Short aliases for the input column and the label column.
texts, sentiment = df['texts'], df['sentiment']

In [None]:
# Tokenise each text by splitting on single space characters
# (punctuation stays attached to the neighbouring word).
tokenized_text = [sentence.split(' ') for sentence in texts]

In [None]:
# Vocabulary: each distinct token receives the next unused integer id, in
# order of first appearance, starting at 1 (0 is never assigned here).
int_dict = {}
for sentence in tokenized_text:
    for token in sentence:
        int_dict.setdefault(token, len(int_dict) + 1)

# Next id that would be handed out.
count = len(int_dict) + 1

In [None]:
# Replace every token with its integer id, keeping one list per text.
text_to_sequences = [
    [int_dict[token] for token in sentence]
    for sentence in tokenized_text
]

In [None]:
# Longest tokenised text and its length. max_len becomes the fixed width that
# every id sequence is padded to. max() returns the first longest text on ties;
# default=[] keeps `value` defined (and max_len at 0) when the corpus is empty.
value = max(tokenized_text, key=len, default=[])
max_len = len(value)

print(value, '\n', max_len)

['क्या', 'मिले', 'हुए', 'मौके', 'को', 'ऐसे', 'गंवाया', 'जा', 'सकता', 'है?', 'समस्या', 'यह', 'है', 'कि', 'अभी', 'ठीक', 'ढंग', 'से', 'स्थापित', 'नहीं', 'हो', 'सके', 'विशाल', 'भारद्वाज', 'और', 'अनुराग', 'कश्यप', 'की', 'शैलियों', 'की', 'नकल', 'में', 'मनीष', 'तिवारी', 'अपनी', 'पहली', 'फिल्म', "'दिल", 'दोस्ती', "एटसेट्रा'", 'की', 'सादगी', 'और', 'गहराई', 'भी', 'भूल', 'गए', 'हैं'] 
 48


In [None]:
# Left-pad every id sequence with zeros so all rows are exactly max_len wide.
padded = [[0] * (max_len - len(seq)) + seq for seq in text_to_sequences]

# Rectangular float matrix in the form the estimators consume.
padded1 = np.asarray(padded, dtype=np.float64)

## Training and Testing

In [None]:
# Features: padded id matrix built from the dataset vocabulary; target: 0/1 labels.
X, y = padded1, sentiment

In [None]:
# Hold out 30% of the rows for testing; the fixed seed keeps the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.3,
    random_state=42,
)

In [None]:
# Parallel accumulators read by the bar-chart cell: model display names and
# the accuracy recorded for each.
name_arr, acc_arr = [], []

def lst_append(name, acc):
  """Append one model's display name and accuracy to the chart accumulators."""
  name_arr.append(name)
  acc_arr.append(acc)
  
def model_perf(model, name):
  """Fit `model` on the current train split, score it on the test split, report.

  Reads the module-level X_train, y_train, X_test and y_test, so it always
  evaluates against whichever split was created most recently.

  Args:
    model: estimator exposing fit(X, y) and predict(X); it is fitted in place.
    name: display name used in the printed report and in the accuracy chart.

  Side effects:
    Records (name, accuracy in percent) through lst_append and prints the
    accuracy followed by the classification report.
  """
  model.fit(X_train, y_train)
  preds = model.predict(X_test)
  # Convert to percent first and round last: rounding the fraction and then
  # multiplying by 100 re-introduces float noise (73.14699999999999 vs 73.147).
  acc = round(accuracy_score(y_test, preds) * 100, 3)
  lst_append(name, acc)
  print('======================================')
  print(f'Accuracy {name}:', acc, '\n')
  print(classification_report(y_test, preds))
  print('======================================')

# Sweep 1: fit every classifier on the split built from the dataset-derived
# vocabulary and log its accuracy. Each fitted estimator stays bound to its
# module-level name. Every stochastic estimator gets a fixed seed so that a
# fresh run reproduces the same numbers.
LogReg = LogisticRegression(random_state=0, max_iter=10000)
model_perf(LogReg, 'Logistic Regression')

RFC = RandomForestClassifier(criterion = 'gini', random_state=0)
model_perf(RFC, 'Random Forest')

SGDC = SGDClassifier(max_iter=1000, tol=1e-3, random_state=0)
model_perf(SGDC, 'Stochastic Gradient Descent')

svm = SVC()
model_perf(svm, 'Support Vector Machine')

# NOTE(review): the recorded output for this cell shows liblinear failing to
# converge; the inputs are raw, unscaled word ids, so scaling or a larger
# max_iter may be needed. Confirm before relying on this score.
LSVC = LinearSVC(random_state=0, tol=1e-5)
model_perf(LSVC, 'Linear Support Vector Classification')

GNB = GaussianNB()
model_perf(GNB, 'Naive Bayes')

BNB = BernoulliNB()
model_perf(BNB, 'Bernoulli Naive Bayes')

MNB = MultinomialNB()
model_perf(MNB, 'Multinomial Naive Bayes')

AdaB = AdaBoostClassifier(n_estimators=100, random_state=0)
model_perf(AdaB, 'AdaBoost')

LGBM = LGBMClassifier(random_state=5)
model_perf(LGBM, 'Light Gradient Boosting Machine')

GBC = GradientBoostingClassifier(n_estimators=100, learning_rate=1.0, max_depth=1, random_state=0)
model_perf(GBC, 'Gradient Boost Classifier')

Accuracy Logistic Regression: 58.631 

              precision    recall  f1-score   support

           0       0.59      0.59      0.59       979
           1       0.58      0.58      0.58       950

    accuracy                           0.59      1929
   macro avg       0.59      0.59      0.59      1929
weighted avg       0.59      0.59      0.59      1929

Accuracy Random Forest: 81.545 

              precision    recall  f1-score   support

           0       0.83      0.80      0.82       979
           1       0.80      0.83      0.82       950

    accuracy                           0.82      1929
   macro avg       0.82      0.82      0.82      1929
weighted avg       0.82      0.82      0.82      1929

Accuracy Stochastic Gradient Descent: 51.115 

              precision    recall  f1-score   support

           0       0.52      0.43      0.47       979
           1       0.50      0.59      0.54       950

    accuracy                           0.51      1929
   macro 


Liblinear failed to converge, increase the number of iterations.



Accuracy Linear Support Vector Classification: 55.21 

              precision    recall  f1-score   support

           0       0.64      0.28      0.38       979
           1       0.53      0.84      0.65       950

    accuracy                           0.55      1929
   macro avg       0.58      0.56      0.52      1929
weighted avg       0.58      0.55      0.51      1929

Accuracy Naive Bayes: 49.819 

              precision    recall  f1-score   support

           0       0.72      0.02      0.04       979
           1       0.50      0.99      0.66       950

    accuracy                           0.50      1929
   macro avg       0.61      0.51      0.35      1929
weighted avg       0.61      0.50      0.34      1929

Accuracy Bernoulli Naive Bayes: 51.322 

              precision    recall  f1-score   support

           0       0.53      0.41      0.46       979
           1       0.50      0.62      0.55       950

    accuracy                           0.51      1929
 

In [None]:
# Bar chart of the accuracies collected by model_perf for the sweep that used
# ids from the dataset-derived vocabulary. The title distinguishes this figure
# from the otherwise identical chart produced after the second sweep.
dict2 = {'Name of the model': name_arr, 'Accuracy': acc_arr}
perf = pd.DataFrame(dict2)
fig = px.bar(
    perf,
    x='Name of the model',
    y='Accuracy',
    title='Test accuracy by model (ids from the dataset vocabulary)',
)
fig.show()

Hindi corpus download page (Leipzig Wortschatz project): https://wortschatz.uni-leipzig.de/en/download/Hindi

In [None]:
# Load the word -> id mapping. The encoding is stated explicitly because the
# keys are Devanagari text and open() otherwise falls back to the platform
# default, which is not UTF-8 everywhere.
with open('/content/drive/MyDrive/sem8/nlp/mini p /hindi_words2.json', 'r', encoding='utf-8') as fp:
  data = json.load(fp)

In [None]:
# Add every dataset token that the loaded vocabulary lacks.
# New ids start one past the current maximum: starting at the maximum itself
# would give the first new token the same id as an existing word.
# Assumes every value in `data` is an integer id - TODO confirm against the JSON file.
count = max(data.values(), default=0) + 1
for i in tokenized_text:
  for j in i:
    if j not in data:
      data[j] = count
      count = count + 1

In [None]:
# Encode every tokenised text with the ids from the extended `data` vocabulary.
txt_to_sequences = [
    [data[token] for token in sentence]
    for sentence in tokenized_text
]

In [None]:
# Left-pad every id sequence with zeros so all rows are exactly max_len wide.
padded = [[0] * (max_len - len(seq)) + seq for seq in txt_to_sequences]

# Rectangular float matrix in the form the estimators consume.
padded2 = np.asarray(padded, dtype=np.float64)

In [None]:
# Features: padded id matrix built from the `data` vocabulary; target: 0/1 labels.
X, y = padded2, sentiment

In [None]:
# Hold out 30% of the rows for testing; the fixed seed keeps the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.3,
    random_state=42,
)

In [None]:
# Start the chart accumulators afresh for this sweep: model display names and
# the accuracy recorded for each.
name_arr, acc_arr = [], []

def lst_append(name, acc):
  """Append one model's display name and accuracy to the chart accumulators."""
  name_arr.append(name)
  acc_arr.append(acc)
  
def model_perf(model, name):
  """Fit `model` on the current train split, score it on the test split, report.

  Reads the module-level X_train, y_train, X_test and y_test, so it always
  evaluates against whichever split was created most recently.

  Args:
    model: estimator exposing fit(X, y) and predict(X); it is fitted in place.
    name: display name used in the printed report and in the accuracy chart.

  Side effects:
    Records (name, accuracy in percent) through lst_append and prints the
    accuracy followed by the classification report.
  """
  model.fit(X_train, y_train)
  preds = model.predict(X_test)
  # Convert to percent first and round last: rounding the fraction and then
  # multiplying by 100 re-introduces float noise (73.14699999999999 vs 73.147).
  acc = round(accuracy_score(y_test, preds) * 100, 3)
  lst_append(name, acc)
  print('======================================')
  print(f'Accuracy {name}:', acc, '\n')
  print(classification_report(y_test, preds))
  print('======================================')

# Sweep 2: same classifiers, fitted on the split built from the `data`
# vocabulary. preprocess() reads RFC, LogReg and LGBM by these exact names, so
# each estimator must be bound to the same name that is passed to model_perf.
# Every stochastic estimator gets a fixed seed so that a fresh run reproduces
# the same numbers.
LogReg = LogisticRegression(random_state=0, max_iter=10000)
model_perf(LogReg, 'Logistic Regression')

# oob_score=True makes the fitted forest expose oob_score_ and
# oob_decision_function_, which the inspection cells read.
RFC = RandomForestClassifier(criterion = 'gini', oob_score=True, random_state=0)
model_perf(RFC, 'Random Forest')

SGDC = SGDClassifier(max_iter=1000, tol=1e-3, random_state=0)
model_perf(SGDC, 'Stochastic Gradient Descent')

svm = SVC()
model_perf(svm, 'Support Vector Machine')

# NOTE(review): the recorded output for this cell shows liblinear failing to
# converge; the inputs are raw, unscaled word ids, so scaling or a larger
# max_iter may be needed. Confirm before relying on this score.
LSVC = LinearSVC(random_state=0, tol=1e-5)
model_perf(LSVC, 'Linear Support Vector Classification')

GNB = GaussianNB()
model_perf(GNB, 'Naive Bayes')

BNB = BernoulliNB()
model_perf(BNB, 'Bernoulli Naive Bayes')

MNB = MultinomialNB()
model_perf(MNB, 'Multinomial Naive Bayes')

AdaB = AdaBoostClassifier(n_estimators=100, random_state=0)
model_perf(AdaB, 'AdaBoost')

LGBM = LGBMClassifier(random_state=5)
model_perf(LGBM, 'Light Gradient Boosting Machine')

GBC = GradientBoostingClassifier(n_estimators=100, learning_rate=1.0, max_depth=1, random_state=0)
model_perf(GBC, 'Gradient Boost Classifier')

Accuracy Logistic Regression: 52.566 

              precision    recall  f1-score   support

           0       0.55      0.39      0.45       979
           1       0.51      0.67      0.58       950

    accuracy                           0.53      1929
   macro avg       0.53      0.53      0.52      1929
weighted avg       0.53      0.53      0.52      1929

Accuracy Random Forest: 73.14699999999999 

              precision    recall  f1-score   support

           0       0.72      0.77      0.74       979
           1       0.74      0.69      0.72       950

    accuracy                           0.73      1929
   macro avg       0.73      0.73      0.73      1929
weighted avg       0.73      0.73      0.73      1929

Accuracy Stochastic Gradient Descent: 50.441 

              precision    recall  f1-score   support

           0       0.51      0.95      0.66       979
           1       0.46      0.04      0.08       950

    accuracy                           0.50      192


Liblinear failed to converge, increase the number of iterations.



Accuracy Linear Support Vector Classification: 50.492000000000004 

              precision    recall  f1-score   support

           0       0.51      0.94      0.66       979
           1       0.48      0.06      0.10       950

    accuracy                           0.50      1929
   macro avg       0.49      0.50      0.38      1929
weighted avg       0.49      0.50      0.39      1929

Accuracy Naive Bayes: 49.611 

              precision    recall  f1-score   support

           0       0.68      0.01      0.03       979
           1       0.49      0.99      0.66       950

    accuracy                           0.50      1929
   macro avg       0.59      0.50      0.34      1929
weighted avg       0.59      0.50      0.34      1929

Accuracy Bernoulli Naive Bayes: 51.322 

              precision    recall  f1-score   support

           0       0.53      0.41      0.46       979
           1       0.50      0.62      0.55       950

    accuracy                           0.5

In [None]:
# Bar chart of the accuracies collected by model_perf for the sweep that used
# ids from the loaded Hindi word list. The title distinguishes this figure from
# the otherwise identical chart produced after the first sweep.
dict2 = {'Name of the model': name_arr, 'Accuracy': acc_arr}
perf = pd.DataFrame(dict2)
fig = px.bar(
    perf,
    x='Name of the model',
    y='Accuracy',
    title='Test accuracy by model (ids from the Hindi word list)',
)
fig.show()

## Taking input from user

In [None]:
def preprocess(text, max_len=48):
  """Encode one raw text like the training rows and print three model predictions.

  The text is split on single spaces, each token is mapped to its id in the
  module-level `data` vocabulary, and the ids are left-padded with zeros to
  `max_len`. The resulting single-row matrix is passed to the module-level
  RFC, LogReg and LGBM estimators. Predicted labels: 1 -> positive,
  0 -> negative.

  Args:
    text: input sentence as a plain string.
    max_len: feature width the estimators were fitted with. The default of 48
      is the length of the longest training text.

  Returns:
    None. The text, the padded id list and the three predictions are printed.
  """
  print(text)
  # Tokens missing from the vocabulary are encoded as 0 (the padding value)
  # rather than raising KeyError on unseen user input.
  txt_seq = [data.get(token, 0) for token in text.split(' ')]
  # Inputs longer than max_len keep their trailing max_len tokens, so the row
  # always has the width the estimators expect. Previously any input with
  # max_len or more tokens produced an empty row.
  if len(txt_seq) > max_len:
    txt_seq = txt_seq[len(txt_seq) - max_len:]
  lst = [0] * (max_len - len(txt_seq)) + txt_seq
  print(lst)
  # Estimators expect a 2-D array: one row, max_len columns.
  new_lst1 = np.asarray(lst, dtype=np.float64).reshape(1, -1)
  a = RFC.predict(new_lst1)
  b = LogReg.predict(new_lst1)
  c = LGBM.predict(new_lst1)
  print(a, b, c)

In [None]:
# Sanity check: predict on the first remaining row and print its true label.
# Positional access is used because the index keeps its original labels after
# the neutral rows were dropped, so label 0 is not guaranteed to exist.
preprocess(texts.iloc[0])
print('\n')
print(sentiment.iloc[0])

लोग वतन तक खा जाते हैं इसका इसे यकीन नहींमान जाएगा तू ले जाकर दिल्ली इसे दिखा ला दोस्त
[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 827084, 662847, 348077, 255655, 203843, 940029, 464374, 587313, 741558, 57945, 860293, 509542, 820683, 286930, 596854, 587313, 48614, 600634, 275464]
[0] [1] [0]


0


In [None]:
# Unlabelled news headline used as a manual smoke test for preprocess().
t1 = 'क्रूज ड्रग्स केस में गवाह प्रभाकर की मौत वकील ने बताया हार्ट अटैक से गई जान'
# English: witness Prabhakar has died in the cruise drugs case; his lawyer said he died of a heart attack
preprocess(t1)

क्रूज ड्रग्स केस में गवाह प्रभाकर की मौत वकील ने बताया हार्ट अटैक से गई जान
[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 565454, 510347, 589442, 535586, 215535, 636119, 122044, 538035, 432068, 442596, 443639, 683702, 663013, 76216, 148173, 943776]
[1] [1] [1]


In [None]:
# Second unlabelled headline used as a manual smoke test for preprocess().
t2 = 'सहरी में पिएं ये स्वादिष्ट हेल्दी शेक रोज़े में नहीं लगेगी प्यास जानें विधि'
# English: drink this delicious healthy shake at sehri and you will not feel thirsty while fasting; learn the method
preprocess(t2)

सहरी में पिएं ये स्वादिष्ट हेल्दी शेक रोज़े में नहीं लगेगी प्यास जानें विधि
[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 883019, 535586, 417565, 57194, 963819, 383777, 732222, 597733, 535586, 272194, 570886, 211923, 651410, 574345]
[1] [1] [0]


## Downloading the trained models

In [None]:
# Disabled on purpose: the code is wrapped in a string literal, so nothing runs
# and the cell only echoes the text. If enabled it would pickle the fitted
# random forest (RFC) to model.pkl in the working directory.
'''import pickle

with open('model.pkl', 'wb') as files:
    pickle.dump(RFC, files)'''

"import pickle\n\nwith open('model.pkl', 'wb') as files:\n    pickle.dump(RFC, files)"

## Understanding the model

In [None]:
# The child estimator template used to create the collection of fitted sub-estimators.
# NOTE(review): newer scikit-learn releases expose this as `estimator` instead of
# `base_estimator` - confirm against the installed version before re-running.
RFC.base_estimator

DecisionTreeClassifier()

In [None]:
# The collection of fitted sub-estimators (the individual decision trees of the forest).
RFC.estimators_

[DecisionTreeClassifier(max_features='auto', random_state=1126597297),
 DecisionTreeClassifier(max_features='auto', random_state=2050624145),
 DecisionTreeClassifier(max_features='auto', random_state=1713249922),
 DecisionTreeClassifier(max_features='auto', random_state=950323429),
 DecisionTreeClassifier(max_features='auto', random_state=338733054),
 DecisionTreeClassifier(max_features='auto', random_state=527331433),
 DecisionTreeClassifier(max_features='auto', random_state=454680678),
 DecisionTreeClassifier(max_features='auto', random_state=1200893022),
 DecisionTreeClassifier(max_features='auto', random_state=860236142),
 DecisionTreeClassifier(max_features='auto', random_state=593806049),
 DecisionTreeClassifier(max_features='auto', random_state=652335096),
 DecisionTreeClassifier(max_features='auto', random_state=377415675),
 DecisionTreeClassifier(max_features='auto', random_state=187997622),
 DecisionTreeClassifier(max_features='auto', random_state=118574283),
 DecisionTreeCla

In [None]:
# The class labels (single output problem), or a list of arrays of class labels (multi-output problem).
# Here the labels are the 0 (negative) / 1 (positive) codes assigned when the dataset was loaded.
RFC.classes_

array([0, 1])

In [None]:
# The number of classes (single output problem), or a list containing the
# number of classes for each output (multi-output problem).
RFC.n_classes_

2

In [None]:
# Number of features seen during fit; each feature is one position of the
# zero-padded id sequence, so this equals the padding width max_len.
RFC.n_features_in_

48

In [None]:
# The number of outputs when fit is performed (a single label column here).
RFC.n_outputs_

1

In [None]:
# The impurity-based feature importances, one value per padded token position.
# Sequences are left-padded, so the leading positions are zero for most texts.
RFC.feature_importances_

array([0.00000000e+00, 6.29163912e-06, 0.00000000e+00, 0.00000000e+00,
       0.00000000e+00, 6.54711078e-06, 2.92420573e-05, 2.10502418e-05,
       3.17657620e-06, 3.56635459e-06, 2.28069301e-05, 5.50157599e-06,
       4.23302550e-05, 2.18716800e-05, 1.14420209e-04, 9.95483377e-05,
       3.56250729e-04, 3.83071752e-04, 5.96464999e-04, 1.45466271e-03,
       1.21995581e-03, 2.95849955e-03, 2.43959300e-03, 3.02653824e-03,
       3.95084848e-03, 5.85520618e-03, 7.85128754e-03, 9.65320288e-03,
       1.27181655e-02, 1.25837635e-02, 1.44794699e-02, 1.81424925e-02,
       2.15256863e-02, 2.45584171e-02, 3.15083506e-02, 3.38058083e-02,
       4.09012223e-02, 4.54583320e-02, 4.99031679e-02, 5.23109013e-02,
       6.36046992e-02, 6.62408276e-02, 6.59661990e-02, 7.10332867e-02,
       7.38701896e-02, 8.49787704e-02, 8.72152199e-02, 8.90730954e-02])

In [None]:
# Score of the training dataset obtained using an out-of-bag estimate.
# Only available because the forest was constructed with oob_score=True.
RFC.oob_score_

0.7303845298955324

In [None]:
# The string literal below serves as a block comment; only the final
# expression is displayed by the cell.
'''
Decision function computed with out-of-bag estimate on the training set. 
If n_estimators is small it might be possible that a data point was never left out during the bootstrap. 
In this case, oob_decision_function_ might contain NaN.
'''
RFC.oob_decision_function_

array([[0.        , 1.        ],
       [0.44444444, 0.55555556],
       [0.81818182, 0.18181818],
       ...,
       [0.69230769, 0.30769231],
       [0.48484848, 0.51515152],
       [0.02702703, 0.97297297]])

In [None]:
# Number of trees in the fitted forest.
count = len(RFC.estimators_)

print(count)

100


In [None]:
# Disabled on purpose: the loop is wrapped in a string literal, so nothing runs.
# If enabled it would write each tree as a .dot file, render it to .png with
# pydot, and copy the image to the PNGs folder on Drive.
# NOTE(review): the loop always exports RFC.estimators_[0], so every
# tree_<n>.png would show the same tree; index with `each` if this is re-enabled.
'''for each in range(0,100):
  file_name1 = 'tree_' + str(each) + '.dot'
  file_name2 = 'tree_' + str(each) + '.png'
  export_graphviz(RFC.estimators_[0], 
                out_file=file_name1, 
                filled = True)
  (graph,) = pydot.graph_from_dot_file(file_name1)
  graph.write_png(file_name2)
  source = '/content/' + file_name2
  destination =  "/content/drive/MyDrive/sem8/nlp/mini p /PNGs"
  shutil.copy(source, destination)
  print(file_name2 + ' downloaded')'''

tree_0.png downloaded
tree_1.png downloaded
tree_2.png downloaded
tree_3.png downloaded
tree_4.png downloaded
tree_5.png downloaded
tree_6.png downloaded
tree_7.png downloaded
tree_8.png downloaded
tree_9.png downloaded
tree_10.png downloaded
tree_11.png downloaded
tree_12.png downloaded
tree_13.png downloaded
tree_14.png downloaded
tree_15.png downloaded
tree_16.png downloaded
tree_17.png downloaded
tree_18.png downloaded
tree_19.png downloaded
tree_20.png downloaded
tree_21.png downloaded
tree_22.png downloaded
tree_23.png downloaded
tree_24.png downloaded
tree_25.png downloaded
tree_26.png downloaded
tree_27.png downloaded
tree_28.png downloaded
tree_29.png downloaded
tree_30.png downloaded
tree_31.png downloaded
tree_32.png downloaded
tree_33.png downloaded
tree_34.png downloaded
tree_35.png downloaded
tree_36.png downloaded
tree_37.png downloaded
tree_38.png downloaded
tree_39.png downloaded
tree_40.png downloaded
tree_41.png downloaded
tree_42.png downloaded
tree_43.png downloade