In [1]:
import pandas as pd
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer as SklearnTfIdf

from src.tf_idf.services import TfIdfVectorizer
from src.tf_idf.services.parallel_tf_idf import ParallelTfIdfVectorizer

In [2]:
# Read the training CSV, keep only its first 4,000 rows, and then discard any
# of those rows that contain missing values.
df = pd.read_csv("./datasets/train.csv").iloc[:4_000].dropna()

# Preview the first rows of the cleaned frame.
df.head()

Unnamed: 0,text,sentiment
0,"I`d have responded, if I were going",neutral
1,Sooo SAD,negative
2,bullying me,negative
3,leave me alone,negative
4,"Sons of ****,",negative


In [3]:
# Encode the sentiment strings as integer codes; `label_to_str` holds the
# unique sentiment values, indexed by code.
labels, label_to_str = pd.factorize(df["sentiment"])

## Vectorizers

In [4]:
# Instantiate the project's TF-IDF vectorizer with its default arguments and
# fit it on the text column of `df`.
my_tf_idf = TfIdfVectorizer()
my_tf_idf.fit(df["text"])

In [5]:
# Build and fit scikit-learn's TF-IDF vectorizer on the text column in one
# step. fit() returns the estimator itself, so binding the result also keeps
# the cell from echoing a repr.
sklearn_tf_idf = SklearnTfIdf().fit(df["text"])

In [6]:
# Build and fit scikit-learn's bag-of-words vectorizer on the text column in
# one step. fit() returns the estimator itself, so binding the result also
# keeps the cell from echoing a repr.
count_vectorizer = CountVectorizer().fit(df["text"])

In [7]:
# Vectorize the same text column the project vectorizer was fitted on.
# The result is converted with .to_numpy() before it is fed to the classifier.
my_tf_idf_vectors = my_tf_idf.transform(df["text"])

In [8]:
# Vectorize the same text column the scikit-learn TF-IDF vectorizer was fitted on.
# The result is converted with .toarray() before it is fed to the classifier.
sklearn_tf_idf_vectors = sklearn_tf_idf.transform(df["text"])

In [9]:
# Vectorize the same text column the count vectorizer was fitted on.
# The result is converted with .toarray() before it is fed to the classifier.
count_vector = count_vectorizer.transform(df["text"])

## Train and evaluate SVC classifiers

### My TF-IDF

In [10]:
# Train and score a default SVC on the project TF-IDF features over five
# random 80/20 train/test splits, collecting the test accuracy of each run.
#
# NOTE(review): train_test_split is called without random_state, so every
# execution draws different splits and the printed accuracies vary run to run.
# NOTE(review): the vectorizer was fitted on all rows of `df` before splitting,
# so the held-out rows also contributed to the fitted vocabulary/statistics.
accuracies = []

# Call to_numpy() once; the feature matrix does not change between iterations.
my_features = my_tf_idf_vectors.to_numpy()

for _ in range(5):
    x_train__my, x_test__my, y_train__my, y_test__my = train_test_split(
        my_features, labels, test_size=0.2, shuffle=True
    )

    my_tf_idf_svc = SVC()
    my_tf_idf_svc.fit(x_train__my, y_train__my)

    predicted_results = my_tf_idf_svc.predict(x_test__my)
    # accuracy_score expects the ground truth first: (y_true, y_pred).
    acc = accuracy_score(y_test__my, predicted_results)
    accuracies.append(acc)
    print(f"Accuracy is: {acc}")

Accuracy is: 0.78625
Accuracy is: 0.7925
Accuracy is: 0.79625
Accuracy is: 0.80125
Accuracy is: 0.78


In [11]:
# Report the mean of the per-split accuracies collected in `accuracies`,
# rounded to three decimals.
mean_accuracy = sum(accuracies) / len(accuracies)
print("AVERAGE ACCURACY FOR MY VECTORIZER: ", round(mean_accuracy, 3))

AVERAGE ACCURACY FOR MY VECTORIZER:  0.791


### Count vectorizer

In [None]:
# Train and score a default SVC on the bag-of-words count features over five
# random 80/20 train/test splits, collecting the test accuracy of each run.
#
# NOTE(review): train_test_split is called without random_state, so every
# execution draws different splits and the printed accuracies vary run to run.
# NOTE(review): the vectorizer was fitted on all rows of `df` before splitting,
# so the held-out rows also contributed to the fitted vocabulary.
accuracies = []

# Densify the sparse matrix once; it does not change between iterations.
count_features = count_vector.toarray()

for _ in range(5):
    x_train__count, x_test__count, y_train__count, y_test__count = train_test_split(
        count_features, labels, test_size=0.2, shuffle=True
    )

    count_svc = SVC()
    count_svc.fit(x_train__count, y_train__count)

    predicted_results = count_svc.predict(x_test__count)
    # accuracy_score expects the ground truth first: (y_true, y_pred).
    acc = accuracy_score(y_test__count, predicted_results)
    accuracies.append(acc)

    print(f"Accuracy is: {acc}")

In [None]:
# Report the mean of the per-split accuracies collected in `accuracies`,
# rounded to three decimals.
mean_accuracy = sum(accuracies) / len(accuracies)
print("AVERAGE ACCURACY FOR COUNT VECTORIZER: ", round(mean_accuracy, 3))

### Sklearn TF-IDF

In [12]:
# Train and score a default SVC on the scikit-learn TF-IDF features over five
# random 80/20 train/test splits, collecting the test accuracy of each run.
#
# NOTE(review): train_test_split is called without random_state, so every
# execution draws different splits and the printed accuracies vary run to run.
# NOTE(review): the vectorizer was fitted on all rows of `df` before splitting,
# so the held-out rows also contributed to the fitted vocabulary/IDF weights.
accuracies = []

# Densify the sparse matrix once; it does not change between iterations.
sk_features = sklearn_tf_idf_vectors.toarray()

for _ in range(5):
    x_train__sk, x_test__sk, y_train__sk, y_test__sk = train_test_split(
        sk_features, labels, test_size=0.2, shuffle=True
    )

    sk_tf_idf_svc = SVC()
    sk_tf_idf_svc.fit(x_train__sk, y_train__sk)

    predicted_results = sk_tf_idf_svc.predict(x_test__sk)
    # accuracy_score expects the ground truth first: (y_true, y_pred).
    acc = accuracy_score(y_test__sk, predicted_results)
    accuracies.append(acc)

    print(f"Accuracy is: {acc}")

Accuracy is: 0.8025
Accuracy is: 0.78625
Accuracy is: 0.8025
Accuracy is: 0.78875
Accuracy is: 0.81625


In [13]:
# Report the mean of the per-split accuracies collected in `accuracies`,
# rounded to three decimals.
# NOTE(review): the label reads "SKLEARN COUNT TF-IDF", but `accuracies` here
# comes from the scikit-learn TF-IDF run; the word "COUNT" looks like a
# copy-paste leftover. Fix the label when the notebook is next re-executed.
mean_accuracy = sum(accuracies) / len(accuracies)
print("AVERAGE ACCURACY FOR SKLEARN COUNT TF-IDF: ", round(mean_accuracy, 3))

AVERAGE ACCURACY FOR SKLEARN COUNT TF-IDF:  0.799
