In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.metrics import confusion_matrix
from sklearn.metrics import classification_report
from imblearn.over_sampling import SMOTE
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import AdaBoostClassifier
from sklearn.ensemble import RandomForestClassifier
import pickle

In [2]:
df = pd.read_csv('feature_vectors.csv')
x = df.values.tolist()
# convert each element from string to list
X = []
for i in range(len(x)):
    Z = []
    for j in range(len(x[i])):
        # if x[i][j] is string
        if isinstance(x[i][j], str):
            p = x[i][j].strip('][').split(', ')
            Z.append(p)
            # convert each element from string to int
            for k in range(len(p)):
                p[k] = float(p[k])
        else :
            break
    X.append(Z)

df2 = pd.read_csv('manual_markings.csv')
y = df2.values.tolist()
# convert each element from string to list
Y = []
for i in range(len(y)):
    Z = []
    for j in range(len(y[i])):
        # if x[i][j] is 0 or 1
        if y[i][j] == 0 or y[i][j] == 1:
            Z.append(int(y[i][j]))
        else: break
    Y.append(Z)

In [3]:
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.1, random_state=42)

X_train = [item for sublist in X_train for item in sublist]
Y_train = [item for sublist in Y_train for item in sublist]
X_test = [item for sublist in X_test for item in sublist]
Y_test = [item for sublist in Y_test for item in sublist]

In [4]:
X_train_arr = np.array(X_train)
X_train_arr[np.isnan(X_train_arr)] = 0

Y_train_arr = np.array(Y_train)
Y_train_arr[np.isnan(Y_train_arr)] = 0

# Instantiate the SMOTE object
smote = SMOTE(sampling_strategy=0.5, random_state=42)

# Resample the dataset using SMOTE
X_resampled, y_resampled = smote.fit_resample(X_train_arr, Y_train_arr)

X_train = X_resampled.tolist()
Y_train = y_resampled.tolist()

In [5]:
model1 = DecisionTreeClassifier()
model1.fit(X_train, Y_train)

# save the model
pickle.dump(model1, open('decision_tree.pkl', 'wb'))

In [8]:
model2 = AdaBoostClassifier()
model2.fit(X_train, Y_train)

# save the model
pickle.dump(model2, open('ada_boost.pkl', 'wb'))

In [10]:
model3 = RandomForestClassifier()
model3.fit(X_train, Y_train)

# save the model
pickle.dump(model3, open('random_forest.pkl', 'wb'))

In [11]:
model = pickle.load(open('decision_tree.pkl', 'rb'))

X_test_arr = np.array(X_test)
X_test_arr[np.isnan(X_test_arr)] = 0
X_test = X_test_arr.tolist()

y_pred = model.predict(X_test)

print(accuracy_score(Y_test, y_pred))
print(confusion_matrix(Y_test, y_pred))
print(classification_report(Y_test, y_pred))

0.7635135135135135
[[85 24]
 [11 28]]
              precision    recall  f1-score   support

           0       0.89      0.78      0.83       109
           1       0.54      0.72      0.62        39

    accuracy                           0.76       148
   macro avg       0.71      0.75      0.72       148
weighted avg       0.79      0.76      0.77       148



In [12]:
model = pickle.load(open('ada_boost.pkl', 'rb'))

X_test_arr = np.array(X_test)
X_test_arr[np.isnan(X_test_arr)] = 0
X_test = X_test_arr.tolist()

y_pred = model.predict(X_test)

print(accuracy_score(Y_test, y_pred))
print(confusion_matrix(Y_test, y_pred))
print(classification_report(Y_test, y_pred))

0.777027027027027
[[91 18]
 [15 24]]
              precision    recall  f1-score   support

           0       0.86      0.83      0.85       109
           1       0.57      0.62      0.59        39

    accuracy                           0.78       148
   macro avg       0.71      0.73      0.72       148
weighted avg       0.78      0.78      0.78       148



In [13]:
model = pickle.load(open('random_forest.pkl', 'rb'))

X_test_arr = np.array(X_test)
X_test_arr[np.isnan(X_test_arr)] = 0
X_test = X_test_arr.tolist()

y_pred = model.predict(X_test)

print(accuracy_score(Y_test, y_pred))
print(confusion_matrix(Y_test, y_pred))
print(classification_report(Y_test, y_pred))

0.8108108108108109
[[98 11]
 [17 22]]
              precision    recall  f1-score   support

           0       0.85      0.90      0.88       109
           1       0.67      0.56      0.61        39

    accuracy                           0.81       148
   macro avg       0.76      0.73      0.74       148
weighted avg       0.80      0.81      0.81       148



In [14]:
df = pd.read_csv('indian_accent_feature_vectors.csv')
x_test = df.values.tolist()
# convert each element from string to list
X_test = []
for i in range(len(x_test)):
    Z = []
    for j in range(len(x_test[i])):
        # if x_test[i][j] is string
        if isinstance(x_test[i][j], str):
            p = x_test[i][j].strip('][').split(', ')
            Z.append(p)
            # convert each element from string to int
            for k in range(len(p)):
                p[k] = float(p[k])
        else :
            break
    X_test.append(Z)

df2 = pd.read_csv('word_markings_indian_accent.csv')
y_test = df2.values.tolist()
# convert each element from string to list
Y_test = []
for i in range(len(y_test)):
    Z = []
    for j in range(len(y_test[i])):
        # if x[i][j] is 0 or 1
        if y_test[i][j] == 0 or y_test[i][j] == 1:
            Z.append(int(y_test[i][j]))
        else: break
    Y_test.append(Z)

X_test = [item for sublist in X_test for item in sublist]
Y_test = [item for sublist in Y_test for item in sublist]

In [16]:
model = pickle.load(open('decision_tree.pkl', 'rb'))

X_test_arr = np.array(X_test)
X_test_arr[np.isnan(X_test_arr)] = 0
X_test = X_test_arr.tolist()

y_pred = model.predict(X_test)

print(classification_report(Y_test, y_pred))


Performance report of the model is :
              Precision    Recall  F1-Score   Support

            0      0.75      0.60      0.67       225
            1      0.43      0.59      0.49       112

     Accuracy                          0.60       337
    Macro_avg      0.59      0.60      0.58       337
 Weighted_avg      0.64      0.60      0.61       337


In [17]:
model = pickle.load(open('ada_boost.pkl', 'rb'))

X_test_arr = np.array(X_test)
X_test_arr[np.isnan(X_test_arr)] = 0
X_test = X_test_arr.tolist()

y_pred = model.predict(X_test)
print(classification_report(Y_test, y_pred))


Performance report of the model is :
              Precision    Recall  F1-Score   Support

            0      0.78      0.68      0.73       225
            1      0.50      0.63      0.55       113

     Accuracy                          0.66       338
    Macro_avg      0.64      0.65      0.64       338
 Weighted_avg      0.69      0.66      0.67       338


In [18]:
model = pickle.load(open('random_forest.pkl', 'rb'))

X_test_arr = np.array(X_test)
X_test_arr[np.isnan(X_test_arr)] = 0
X_test = X_test_arr.tolist()

y_pred = model.predict(X_test)


print(classification_report(Y_test, y_pred))


Performance report of the model is :
              Precision    Recall  F1-Score   Support

            0      0.77      0.73      0.75       225
            1      0.52      0.58      0.55       113

     Accuracy                          0.68       338
    Macro_avg      0.65      0.65      0.65       338
 Weighted_avg      0.69      0.68      0.68       338
