In [1]:
import pandas as pd

In [2]:
import spacy
from spacy.lang.en.stop_words import STOP_WORDS
from string import punctuation
from sklearn.linear_model import LogisticRegression
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.pipeline import Pipeline
from sklearn.metrics import confusion_matrix, classification_report, accuracy_score
from sklearn.model_selection import train_test_split
from sklearn.ensemble import GradientBoostingClassifier

In [3]:
from tqdm.notebook import tqdm
import nltk
nltk.download('punkt')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


True

In [10]:
def preprocess(reviews):
    new_data = []
    for text in tqdm(reviews):
        text_new = ""
        for i in nltk.word_tokenize(text.lower()):
            if i not in punctuation:
                text_new += i + " "
        new_data.append(text_new)
    return new_data

In [6]:
data = pd.read_csv("/content/train.csv")

In [11]:
print(preprocess(data['full_text'][0:10]))

  0%|          | 0/10 [00:00<?, ?it/s]

["many people have car where they live the thing they do n't know is that when you use a car alot of thing can happen like you can get in accidet or the smoke that the car has is bad to breath on if someone is walk but in vauban germany they dont have that proble because 70 percent of vauban 's families do not own cars and 57 percent sold a car to move there street parkig driveways and home garages are forbidden on the outskirts of freiburd that near the french and swiss borders you probaly wo n't see a car in vauban 's streets because they are completely `` car free '' but if some that lives in vauban that owns a car ownership is allowed but there are only two places that you can park a large garages at the edge of the development where a car owner buys a space but it not cheap to buy one they sell the space for you car for 40,000 along with a home the vauban people completed this in 2006 they said that this an example of a growing trend in europe the untile states and some where else

In [13]:
X_train_preproc = preprocess(data['full_text'])
y_train = list(data['score'])

  0%|          | 0/17307 [00:00<?, ?it/s]

In [14]:
X_train_slice, X_test_slice, y_train, y_test = train_test_split(X_train_preproc, y_train, random_state=42)

In [15]:
tfidf = TfidfVectorizer(min_df = 3)
model = LogisticRegression(verbose=1, max_iter = 1000, solver = 'sag')

In [17]:
pipe = Pipeline([("tfidf", tfidf), ("logreg", model)])

In [18]:
pipe.fit(X_train_slice, y_train)

convergence after 61 epochs took 10 seconds


In [20]:
y_pred = pipe.predict(X_test_slice)

In [21]:
print(classification_report(y_test,y_pred))
print("\n\n")
print(confusion_matrix(y_test,y_pred))
print(accuracy_score(y_pred=y_pred, y_true=y_test))

              precision    recall  f1-score   support

           1       0.70      0.30      0.42       326
           2       0.54      0.52      0.53      1206
           3       0.49      0.63      0.55      1574
           4       0.46      0.47      0.47       938
           5       0.55      0.05      0.09       239
           6       0.00      0.00      0.00        44

    accuracy                           0.50      4327
   macro avg       0.46      0.33      0.34      4327
weighted avg       0.51      0.50      0.49      4327




[[ 97 188  35   6   0   0]
 [ 24 631 519  32   0   0]
 [ 14 317 992 250   1   0]
 [  3  34 451 443   7   0]
 [  0   1  42 184  12   0]
 [  0   0   0  42   2   0]]
0.502657730529235


  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


In [30]:
model_newton = LogisticRegression(verbose=1, max_iter = 1000, solver = 'newton-cg')

In [31]:
pipe_newton = Pipeline([("tfidf", tfidf), ("logreg", model_newton)])

In [32]:
pipe_newton.fit(X_train_slice, y_train)

In [33]:
y_pred = pipe_newton.predict(X_test_slice)

In [34]:
print(classification_report(y_test,y_pred))
print("\n\n")
print(confusion_matrix(y_test,y_pred))
print(accuracy_score(y_pred=y_pred, y_true=y_test))

              precision    recall  f1-score   support

           1       0.70      0.30      0.42       326
           2       0.54      0.52      0.53      1206
           3       0.49      0.63      0.55      1574
           4       0.46      0.47      0.47       938
           5       0.55      0.05      0.09       239
           6       0.00      0.00      0.00        44

    accuracy                           0.50      4327
   macro avg       0.46      0.33      0.34      4327
weighted avg       0.51      0.50      0.49      4327




[[ 97 188  35   6   0   0]
 [ 24 631 519  32   0   0]
 [ 14 317 992 250   1   0]
 [  3  34 451 443   7   0]
 [  0   1  42 184  12   0]
 [  0   0   0  42   2   0]]
0.502657730529235


  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
