In [None]:
import pandas as pd
# Load the tab-separated training file. It is read without a header row, so
# the two columns are given explicit names afterwards.
training = pd.read_csv('train-small.tsv', sep="\t", header=None)
training.columns = ['label', 'text']

# Discard every row whose text field is missing.
training = training.dropna(subset=['text'])

# Preview the first few rows as the cell's output.
training.head()

In [None]:
from sklearn.feature_extraction.text import CountVectorizer
# Count-based features over unigrams and bigrams; a term must occur in at
# least two documents of the fitting corpus to enter the vocabulary.
vectorizer = CountVectorizer(
    ngram_range=(1, 2),
    min_df=2,
)

In [None]:
from sklearn.model_selection import train_test_split
# Hold out 20% of the labelled examples as a development set; the fixed
# random_state makes the split repeatable.
X_train, X_dev, y_train, y_dev = train_test_split(
    training['text'],
    training['label'],
    test_size=0.2,
    random_state=42,
)

# Learn the vocabulary from the training portion only, then convert the
# resulting sparse count matrix into a dense array.
X_train_vec = vectorizer.fit_transform(X_train).toarray()

In [None]:
from sklearn.naive_bayes import GaussianNB
# Fit a Gaussian naive Bayes classifier on the dense training features.
nb_clf = GaussianNB()
nb_clf.fit(X_train_vec, y_train)

# Spot-check the classifier on two hand-made sentences, each a Google
# translation of the English text noted beside it.
for sentence in (
    # "It's a beautiful day in the neighborhood."
    "Mae'n ddiwrnod hyfryd yn y gymdogaeth.",
    # "I'm really sad my courses are all online."
    "Rwy'n drist iawn bod fy nghyrsiau i gyd ar-lein.",
):
    # Vectorise with the already-fitted vocabulary before predicting.
    sentence_vec = vectorizer.transform([sentence]).toarray()
    print(nb_clf.predict(sentence_vec))

In [None]:
from sklearn.linear_model import LogisticRegression
# Fit a logistic regression classifier on the dense training features.
# scikit-learn applies L2 regularization unless another penalty is requested.
lr_clf = LogisticRegression(max_iter=1000, random_state=42)
lr_clf.fit(X_train_vec, y_train)

# Spot-check the classifier on two hand-made sentences.
for sentence in (
    "Mae'n ddiwrnod hyfryd yn y gymdogaeth.",
    "Rwy'n drist iawn bod fy nghyrsiau i gyd ar-lein.",
):
    # Vectorise with the already-fitted vocabulary before predicting.
    sentence_vec = vectorizer.transform([sentence]).toarray()
    print(lr_clf.predict(sentence_vec))

In [None]:
from sklearn.metrics import classification_report, accuracy_score
# Encode the development texts with the vocabulary learned during training.
X_dev_vec = vectorizer.transform(X_dev).toarray()

# Collect each model's predictions on the development set.
y_guess_nb = nb_clf.predict(X_dev_vec)
y_guess_lr = lr_clf.predict(X_dev_vec)

# Report accuracy followed by the per-class report, naive Bayes first and
# logistic regression second.
for y_guess in (y_guess_nb, y_guess_lr):
    print(accuracy_score(y_dev, y_guess))
    print(classification_report(y_dev, y_guess))