In [None]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
from numpy import array
from numpy import asarray
from numpy import zeros
import os
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import itertools
import string
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report, f1_score
from sklearn.feature_extraction.text import  CountVectorizer, TfidfTransformer,TfidfVectorizer
from sklearn.ensemble import  RandomForestClassifier
import nltk
import re

In [None]:
nltk.download('stopwords')
stopwords = nltk.corpus.stopwords.words('english')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


In [None]:
data=pd.read_excel('/content/drive/MyDrive/(updated) final_data_tulu.xlsx')
data.head(20)

Unnamed: 0,Text,Annotations
0,ದೈವ ತನಿಯೆ ಪಂಡಾ ಏರ್...ಅರೆನ ಕಟ್ಲೆ ಕ್ರಮ ದಾದ....ಉಂ...,Neutral
1,"ದಯಾದೀದ್ ಡಿಲೀಟ್ ಮಲ್ಪುಲೆ,ಅಂಚನೆ ನನ ದೈವೊಲೆನ್ ಯೂಸ್ ...",Mixed Feeling
2,Last da lines powefull and sathyoda patera..️ ...,Positive
3,"ಕತೆ, ನಟನೆ, ಹಿನ್ನೆಲೆ ಸಂಗೀತ, ಛಾಯಾಗ್ರಹಣ, ಅದ್ಭುತವ...",Not Tulu
4,Hats off u all... Mai navirelisuva kshana... S...,Not Tulu
5,ಕೊರಗ ತನಿಯಾ ಸ್ವಾಮಿ ಶ್ರೀ ಕೊರಗಜ್ಜ ️,Neutral
6,ನಂಬಿನಕ್ಲೆಗ್ ಅಜ್ಜೆ ಕೈ ಬುಡಯೆರ್oದ್ ಪನ್ಪಿನ ಪಾತೆರ ಯ...,Positive
7,Waaaಎಡ್ಡೆ ರೀತಿ ಡ್ ಮಂದರ್ goosebumpsಕೊರಗಜ್ಜ,Positive
8,Wonderful story ಸ್ವಾಮಿ ಅಜ್ಜ,Positive
9,ಸ್ವಾಮಿ ಅಜ್ಜಾ.... ಕಾರ್ಣಿಕದ ಮಣ್ ದ ನಂಬಿನಕ್ಲೆನ ಕ...,Positive


In [None]:
data.shape

(7171, 2)

In [None]:
data['Annotations'].value_counts()

Positive         3164
Neutral          1212
Mixed Feeling    1201
Not Tulu          924
Negative          670
Name: Annotations, dtype: int64

In [None]:
def clean_text(text):
    text = re.sub('@[a-zA-Z0-9]*', '', text)
    text = "".join([word.lower() for word in text if word not in string.punctuation]) 
    text = "".join([word for word in text if not word.isdigit()])
    text = " ".join(word for word in text.split() if word not in stopwords)
    return text

In [None]:
# feature extraction from train set
def feature_extraction(data):
    data['clean_text'] = data['Text'].apply(lambda x:clean_text(x))
        
    data['char_count'] = data['Text'].apply(len)
    data['word_count'] = data['Text'].apply(lambda x: len(x.split()))
    data['word_density'] = data['char_count'] / (data['word_count']+1)
    data['punctuation_count'] = data['Text'].apply(lambda x: len("".join(_ for _ in x if _ in string.punctuation))) 
    return data

In [None]:
df=feature_extraction(data)

In [None]:
df.head()

Unnamed: 0,Text,Annotations,clean_text,char_count,word_count,word_density,punctuation_count
0,ದೈವ ತನಿಯೆ ಪಂಡಾ ಏರ್...ಅರೆನ ಕಟ್ಲೆ ಕ್ರಮ ದಾದ....ಉಂ...,Neutral,ದೈವ ತನಿಯೆ ಪಂಡಾ ಏರ್ಅರೆನ ಕಟ್ಲೆ ಕ್ರಮ ದಾದಉಂದು ಮಾತ ...,80,14,5.333333,9
1,"ದಯಾದೀದ್ ಡಿಲೀಟ್ ಮಲ್ಪುಲೆ,ಅಂಚನೆ ನನ ದೈವೊಲೆನ್ ಯೂಸ್ ...",Mixed Feeling,ದಯಾದೀದ್ ಡಿಲೀಟ್ ಮಲ್ಪುಲೆಅಂಚನೆ ನನ ದೈವೊಲೆನ್ ಯೂಸ್ ಮ...,75,10,6.818182,1
2,Last da lines powefull and sathyoda patera..️ ...,Positive,last da lines powefull sathyoda patera️ ಅಜ್ಜ,50,8,5.555556,2
3,"ಕತೆ, ನಟನೆ, ಹಿನ್ನೆಲೆ ಸಂಗೀತ, ಛಾಯಾಗ್ರಹಣ, ಅದ್ಭುತವ...",Not Tulu,ಕತೆ ನಟನೆ ಹಿನ್ನೆಲೆ ಸಂಗೀತ ಛಾಯಾಗ್ರಹಣ ಅದ್ಭುತವಾಗಿದೆ ️,53,7,6.625,4
4,Hats off u all... Mai navirelisuva kshana... S...,Not Tulu,hats u mai navirelisuva kshana swami koragajja,60,9,6.0,6


In [None]:
X_train, X_test, y_train, y_test = train_test_split(df['Text'], df.Annotations, random_state=5, test_size=0.2)

In [None]:
y_train.value_counts()

Positive         2501
Neutral           984
Mixed Feeling     953
Not Tulu          750
Negative          548
Name: Annotations, dtype: int64

In [None]:
y_test.value_counts()

Positive         663
Mixed Feeling    248
Neutral          228
Not Tulu         174
Negative         122
Name: Annotations, dtype: int64

In [None]:
import regex 

def custom_analyzer(text):
    words = regex.findall(r'\w{2,}', text) # extract words of at least 2 letters
    for w in words:
        yield w

In [None]:
# characters level tf-idf
from sklearn.feature_extraction.text import  CountVectorizer, TfidfTransformer,TfidfVectorizer
#tfidf_vect_ngram_chars = TfidfVectorizer(analyzer='word',ngram_range=(2,3))
tfidf_vect_ngram_chars = TfidfVectorizer(analyzer=custom_analyzer,ngram_range=(2,3))
tfidf_vect_ngram_chars.fit(X_train)
xtrain_tfidf_ngram_chars =  tfidf_vect_ngram_chars.transform(X_train) 
xtest_tfidf_ngram_chars =  tfidf_vect_ngram_chars.transform(X_test) 

  "The parameter 'ngram_range' will not be used"


In [None]:
# a=tfidf_vect_ngram_chars.get_feature_names()

# with open('/content/drive/MyDrive/tfidf.txt','w')as o:
#   for i in a:
#     o.write(i)
#     o.write('\n')




In [None]:
xtrain_tfidf_ngram_chars.shape,xtest_tfidf_ngram_chars.shape

((5736, 12940), (1435, 12940))

In [None]:
(5736, 12940)

In [None]:
# # to construct tfidf 

# count_vect = CountVectorizer()
# cv_vect= count_vect.fit(X_train)
# cv_vect_train= count_vect.transform(X_train)
# cv_vect_test= count_vect.transform(X_test)

# tfidf_transformer = TfidfTransformer()
# X_tfidf = tfidf_transformer.fit(cv_vect_train)
# X_tfidf_train = tfidf_transformer.transform(cv_vect_train)
# X_tfidf_test = tfidf_transformer.transform(cv_vect_test)

# # train set
# X_features_train = pd.concat([pd.DataFrame(X_tfidf_train.toarray()),pd.DataFrame(xtrain_tfidf_ngram_chars.toarray())], axis=1)

# #test set
# X_features_test = pd.concat([pd.DataFrame(X_tfidf_test.toarray()),pd.DataFrame(xtest_tfidf_ngram_chars.toarray())], axis=1)

In [None]:
from sklearn.naive_bayes import MultinomialNB

clf = MultinomialNB()

clf.fit(xtrain_tfidf_ngram_chars, y_train) 

y_pred = clf.predict(xtest_tfidf_ngram_chars) 

accuracy = accuracy_score(y_test, y_pred)

print("Test Accuracy:", round(accuracy*100, 4))

print("\n", classification_report(y_test, y_pred))

Test Accuracy: 51.9164

                precision    recall  f1-score   support

Mixed Feeling       0.39      0.04      0.07       248
     Negative       0.83      0.04      0.08       122
      Neutral       0.71      0.18      0.29       228
     Not Tulu       1.00      0.17      0.29       174
     Positive       0.50      1.00      0.67       663

     accuracy                           0.52      1435
    macro avg       0.69      0.28      0.28      1435
 weighted avg       0.60      0.52      0.41      1435



In [None]:
from sklearn.ensemble import RandomForestClassifier
rf=RandomForestClassifier(n_estimators=100, max_depth=None, n_jobs=-1)
rf.fit(xtrain_tfidf_ngram_chars, y_train) 

y_pred = rf.predict(xtest_tfidf_ngram_chars) 

accuracy = accuracy_score(y_test, y_pred)

print("Test Accuracy:", round(accuracy*100, 4))

print("\n", classification_report(y_test, y_pred))

Test Accuracy: 58.1185

                precision    recall  f1-score   support

Mixed Feeling       0.53      0.19      0.28       248
     Negative       0.46      0.17      0.25       122
      Neutral       0.35      0.70      0.46       228
     Not Tulu       0.83      0.28      0.42       174
     Positive       0.72      0.84      0.77       663

     accuracy                           0.58      1435
    macro avg       0.58      0.44      0.44      1435
 weighted avg       0.62      0.58      0.55      1435



In [None]:
from sklearn.linear_model import LogisticRegression
lr= LogisticRegression()
lr.fit(xtrain_tfidf_ngram_chars, y_train) 

y_pred = lr.predict(xtest_tfidf_ngram_chars) 

accuracy = accuracy_score(y_test, y_pred)

print("Test Accuracy:", round(accuracy*100, 4))

print("\n", classification_report(y_test, y_pred))

Test Accuracy: 61.6028

                precision    recall  f1-score   support

Mixed Feeling       0.47      0.25      0.33       248
     Negative       0.49      0.17      0.25       122
      Neutral       0.54      0.40      0.46       228
     Not Tulu       0.90      0.44      0.59       174
     Positive       0.63      0.96      0.76       663

     accuracy                           0.62      1435
    macro avg       0.61      0.44      0.48      1435
 weighted avg       0.61      0.62      0.57      1435



STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression


In [None]:
from sklearn import svm
lisvm = svm.LinearSVC()
lisvm.fit(xtrain_tfidf_ngram_chars, y_train) 
y_pred = lisvm.predict(xtest_tfidf_ngram_chars) 
accuracy = accuracy_score(y_test, y_pred)
print("Test Accuracy:", round(accuracy*100, 4))
print("\n", classification_report(y_test, y_pred))

Test Accuracy: 62.5087

                precision    recall  f1-score   support

Mixed Feeling       0.41      0.29      0.34       248
     Negative       0.45      0.33      0.38       122
      Neutral       0.49      0.43      0.46       228
     Not Tulu       0.82      0.57      0.68       174
     Positive       0.69      0.89      0.78       663

     accuracy                           0.63      1435
    macro avg       0.57      0.50      0.53      1435
 weighted avg       0.61      0.63      0.60      1435



In [None]:
from sklearn.tree import DecisionTreeClassifier
dt = DecisionTreeClassifier()
dt.fit(xtrain_tfidf_ngram_chars, y_train) 
y_pred = dt.predict(xtest_tfidf_ngram_chars) 
accuracy = accuracy_score(y_test, y_pred)
print("Test Accuracy:", round(accuracy*100, 4))
print("\n", classification_report(y_test, y_pred))

Test Accuracy: 53.1707

                precision    recall  f1-score   support

Mixed Feeling       0.35      0.23      0.28       248
     Negative       0.31      0.22      0.26       122
      Neutral       0.32      0.54      0.40       228
     Not Tulu       0.57      0.32      0.41       174
     Positive       0.72      0.75      0.73       663

     accuracy                           0.53      1435
    macro avg       0.45      0.41      0.42      1435
 weighted avg       0.54      0.53      0.52      1435



In [None]:
from sklearn.neighbors import KNeighborsClassifier
neigh = KNeighborsClassifier(n_neighbors=3)
neigh.fit(xtrain_tfidf_ngram_chars, y_train) 
y_pred = neigh.predict(xtest_tfidf_ngram_chars) 
accuracy = accuracy_score(y_test, y_pred)
print("Test Accuracy:", round(accuracy*100, 4))
print("\n", classification_report(y_test, y_pred))

Test Accuracy: 56.0976

                precision    recall  f1-score   support

Mixed Feeling       0.28      0.33      0.30       248
     Negative       0.35      0.29      0.32       122
      Neutral       0.40      0.34      0.37       228
     Not Tulu       0.78      0.42      0.54       174
     Positive       0.71      0.81      0.76       663

     accuracy                           0.56      1435
    macro avg       0.50      0.44      0.46      1435
 weighted avg       0.56      0.56      0.55      1435



In [None]:
from sklearn.neural_network import MLPClassifier
mlp = MLPClassifier(random_state=1, max_iter=300)
mlp.fit(xtrain_tfidf_ngram_chars, y_train) 
y_pred = mlp.predict(xtest_tfidf_ngram_chars) 
accuracy = accuracy_score(y_test, y_pred)
print("Test Accuracy:", round(accuracy*100, 4))
print("\n", classification_report(y_test, y_pred))

Test Accuracy: 61.115

                precision    recall  f1-score   support

Mixed Feeling       0.41      0.36      0.38       248
     Negative       0.43      0.29      0.34       122
      Neutral       0.43      0.46      0.45       228
     Not Tulu       0.77      0.56      0.65       174
     Positive       0.72      0.83      0.77       663

     accuracy                           0.61      1435
    macro avg       0.55      0.50      0.52      1435
 weighted avg       0.60      0.61      0.60      1435



In [None]:
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import StratifiedKFold, cross_val_score, KFold, cross_val_score, GridSearchCV 
from sklearn.svm import SVC

kfolds = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
#Support Vector Machine(SVM)
grid_svm = GridSearchCV(estimator = SVC(probability=True, class_weight="balanced", random_state=42),
                    param_grid = {'C': [0.01, 0.1, 1]}, 
                    cv = kfolds, verbose=1) 
grid_svm.fit(xtrain_tfidf_ngram_chars, y_train)
svm = grid_svm.predict(xtest_tfidf_ngram_chars)
print(classification_report(y_test, svm))


Fitting 5 folds for each of 3 candidates, totalling 15 fits
               precision    recall  f1-score   support

Mixed Feeling       0.36      0.47      0.41       248
     Negative       0.50      0.28      0.36       122
      Neutral       0.43      0.56      0.49       228
     Not Tulu       0.83      0.54      0.66       174
     Positive       0.80      0.77      0.78       663

     accuracy                           0.61      1435
    macro avg       0.58      0.52      0.54      1435
 weighted avg       0.64      0.61      0.62      1435

