In [1]:
import pandas as pd

df_train_sentiment = pd.read_csv("all_train_sentiment.csv")
df_train_sentiment

Unnamed: 0,description,sentiment
0,Bromwell High is a cartoon comedy. It ran at t...,1.0
1,Homelessness (or Houselessness as George Carli...,1.0
2,Brilliant over-acting by Lesley Ann Warren. Be...,1.0
3,This is easily the most underrated film inn th...,1.0
4,This is not the typical Mel Brooks film. It wa...,1.0
...,...,...
24995,"Towards the end of the movie, I felt it was to...",0.0
24996,This is the kind of movie that my enemies cont...,0.0
24997,I saw 'Descent' last night at the Stockholm Fi...,0.0
24998,Some films that you pick up for a pound turn o...,0.0


In [2]:
df_test_sentiment = pd.read_csv("all_test_sentiment.csv")
df_test_sentiment

Unnamed: 0,description,sentiment
0,I went and saw this movie last night after bei...,1.0
1,Actor turned director Bill Paxton follows up h...,1.0
2,As a recreational golfer with some knowledge o...,1.0
3,"I saw this film in a sneak preview, and it is ...",1.0
4,Bill Paxton has taken the true story of the 19...,1.0
...,...,...
24995,I occasionally let my kids watch this garbage ...,0.0
24996,When all we have anymore is pretty much realit...,0.0
24997,The basic genre is a thriller intercut with an...,0.0
24998,Four things intrigued me as to this film - fir...,0.0


In [3]:
from nltk.corpus import stopwords
from nltk.stem.snowball import EnglishStemmer
import re
from sklearn.feature_extraction.text import TfidfVectorizer

In [4]:
def stem_tokenizer(text):
    stemmer = EnglishStemmer(ignore_stopwords=True)
    words = re.sub(r"[^A-Za-z0-9\-]", " ", text).lower().split()
    words = [stemmer.stem(word) for word in words]
    return words 

In [5]:
tfidf = TfidfVectorizer(stop_words=stopwords.words('english'),
                        tokenizer=stem_tokenizer,
                        lowercase=True,
                        max_df=0.5,
                        min_df=5,
                        ngram_range=(1, 2)
                       )

In [6]:
X_train = pd.DataFrame(tfidf.fit_transform(df_train_sentiment['description']).toarray())
X_train.columns = tfidf.get_feature_names_out()
X_train



Unnamed: 0,-,- -,- 1,- 10,- 2,- 3,- 4,- 5,- 7,- 8,...,zorro,zorro fight,zu,zu warrior,zucco,zucker,zuckerman,zulu,zuniga,zwick
0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
24995,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
24996,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
24997,0.069563,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
24998,0.038734,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [7]:
X_test = pd.DataFrame(tfidf.transform(df_test_sentiment['description']).toarray())
X_test.columns = tfidf.get_feature_names_out()
X_test

Unnamed: 0,-,- -,- 1,- 10,- 2,- 3,- 4,- 5,- 7,- 8,...,zorro,zorro fight,zu,zu warrior,zucco,zucker,zuckerman,zulu,zuniga,zwick
0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.040741,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
24995,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
24996,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
24997,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
24998,0.045600,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [8]:
y_train = df_train_sentiment['sentiment']
y_test = df_test_sentiment['sentiment']

In [9]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report

In [10]:
lgr = LogisticRegression()
lgr.fit(X_train, y_train)
y_train_pred = lgr.predict(X_train)
y_test_pred = lgr.predict(X_test)

In [11]:
print(classification_report(y_train, y_train_pred))

              precision    recall  f1-score   support

         0.0       0.95      0.94      0.95     12500
         1.0       0.94      0.95      0.95     12500

    accuracy                           0.95     25000
   macro avg       0.95      0.95      0.95     25000
weighted avg       0.95      0.95      0.95     25000



In [12]:
print(classification_report(y_test, y_test_pred))

              precision    recall  f1-score   support

         0.0       0.89      0.89      0.89     12500
         1.0       0.89      0.89      0.89     12500

    accuracy                           0.89     25000
   macro avg       0.89      0.89      0.89     25000
weighted avg       0.89      0.89      0.89     25000

