In [46]:
import pandas as pd

from sklearn.metrics import accuracy_score
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer
from sklearn.naive_bayes import MultinomialNB
from sklearn.model_selection import train_test_split

In [47]:
# Read both CSV files, using each file's 'id' column as the row index.
train_df, test_df = (
    pd.read_csv(csv_name, index_col='id')
    for csv_name in ('train.csv', 'test.csv')
)

In [48]:
# Show the frame read from train.csv (indexed by 'id') to check its columns.
train_df

Unnamed: 0_level_0,review,label
id,Unnamed: 1_level_1,Unnamed: 2_level_1
0,<p></p>not worth the money,good
1,Love magazines full of non important informati...,good
2,I have been subscribing to this magazine for y...,good
3,I was amazed at how quickly this came in the m...,good
4,...<br>I can't get Hearst to extend my subscri...,good
...,...,...
6661,Love this magazine! Informative and interesting!,good
6662,Excellent and interesting articles. Quite a lo...,good
6663,</li>not all of us have kindle fire or an ipho...,good
6664,Came exactly as expected - great price/year. ...,good


In [49]:
# Hold out 20% of the labelled reviews for validation; the fixed seed keeps
# the split identical across re-runs.
X_train, X_test, y_train, y_test = train_test_split(
    train_df['review'],
    train_df['label'],
    test_size=0.2,
    random_state=42,
)

In [50]:
# Learn the token vocabulary from the training split only, so no held-out
# text influences the feature space. fit() returns the vectorizer itself.
cv = CountVectorizer().fit(X_train)
cv

In [51]:
# Encode the training reviews as a sparse token-count matrix using the
# vocabulary fitted on X_train.
X_train_cv = cv.transform(X_train)

In [52]:
# Inspect the shape, dtype and storage format of the vectorized training matrix.
X_train_cv

<5332x6020 sparse matrix of type '<class 'numpy.int64'>'
	with 81232 stored elements in Compressed Sparse Row format>

In [53]:
# Labelled view of the token counts, for inspection only (the model is trained
# on the sparse matrix itself). Build the frame directly from the sparse
# matrix: .toarray() would materialise every zero of the 5332 x 6020 int64
# matrix (roughly 250 MB) although only 81232 entries are non-zero.
X_train_cv_df = pd.DataFrame.sparse.from_spmatrix(
    X_train_cv,
    columns=cv.get_feature_names_out(),
)
X_train_cv_df

Unnamed: 0,00,00029you,01,03,04,06,0644923,07,08,09,...,yuck,yummy,zero,zineif,zinio,zone,zoom,zooming,zp,页面文字
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5327,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
5328,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
5329,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
5330,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [54]:
# Multinomial Naive Bayes classifier with default hyperparameters.
model = MultinomialNB()

In [55]:
# Train the classifier on the token counts and labels of the training split.
model.fit(X_train_cv, y_train)

In [56]:
# Vectorize the held-out reviews with the already-fitted vocabulary
# (transform only; the vectorizer is not re-fitted on held-out text).
X_test_cv = cv.transform(X_test)

In [57]:
# Predict labels for the held-out split.
y_pred = model.predict(X_test_cv)

In [58]:
# Report accuracy on the held-out 20% split (y_test vs. the model's predictions).
print(f'Accuracy: {accuracy_score(y_test, y_pred)}')

Accuracy: 0.6229385307346327


In [59]:
def create_submission_file(predictions, test_df, submission_file_name="submission.csv"):
    """Write predictions to a two-column CSV submission file.

    Args:
        predictions: One predicted value per row of ``test_df``, in row order.
        test_df: Frame whose index supplies the values of the ``id`` column.
        submission_file_name: Path of the CSV file to write.

    The file has the columns ``id`` and ``Target`` and no extra index column.
    A confirmation message is printed once the file has been written.
    """
    columns = {'id': test_df.index, 'Target': predictions}
    # index=False: the ids are already a regular column, so the positional
    # row index must not be written out as well.
    pd.DataFrame(columns).to_csv(submission_file_name, index=False)
    print(f"Submission file '{submission_file_name}' created successfully.")

In [60]:
# Vectorize the test.csv reviews with the fitted vocabulary, predict their
# labels, and write the predictions out as the submission file.
test_features = cv.transform(test_df['review'])
test_predictions = model.predict(test_features)
create_submission_file(test_predictions, test_df, 'submission.csv')

Submission file 'submission.csv' created successfully.
