In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer
from sklearn.neural_network import MLPClassifier
from sklearn.metrics import f1_score

In [2]:
# Load only the first 1000 rows of the IMDB reviews CSV. `nrows` stops parsing
# after 1000 rows instead of reading the whole file and slicing it afterwards
# (presumably the subset exists to keep vectorizing/training fast -- confirm).
dataset = pd.read_csv('imdb.csv', nrows=1000)

# Preview as the cell's last expression so the frame is actually rendered;
# a bare `.head()` in the middle of a cell displays nothing.
dataset.head()

# Create Vectors

In [3]:
# Baseline bag-of-words features: English stop words removed, no
# document-frequency filtering. `count` keeps the fitted vocabulary and
# `X` holds the resulting document-term count matrix.
count = CountVectorizer(stop_words='english')
X = count.fit_transform(dataset['review'])

# Show (number of documents, vocabulary size).
n_docs, n_terms = X.shape
print((n_docs, n_terms))

(1000, 17625)


In [4]:
def _count_features(**df_bounds):
    """Fit a stop-word-filtered CountVectorizer on the reviews.

    `df_bounds` is forwarded to CountVectorizer (``max_df`` / ``min_df``).
    Returns the fitted vectorizer and its document-term count matrix.
    """
    vectorizer = CountVectorizer(stop_words='english', **df_bounds)
    return vectorizer, vectorizer.fit_transform(dataset['review'])


# Upper document-frequency cut-offs: drop terms found in more than 60/70/80 %
# of the reviews.
count_60, X_60 = _count_features(max_df=0.6)
count_70, X_70 = _count_features(max_df=0.7)
count_80, X_80 = _count_features(max_df=0.8)

# Lower document-frequency cut-offs: keep only terms found in at least
# 10/20/30 % of the reviews.
count_10, X_10 = _count_features(min_df=0.1)
count_20, X_20 = _count_features(min_df=0.2)
count_30, X_30 = _count_features(min_df=0.3)

# In the recorded run the max_df variants keep essentially the whole
# vocabulary (17624-17625 terms), while the min_df variants shrink it to
# 80, 22 and 9 terms.
for counts in (X_60, X_70, X_80, X_10, X_20, X_30):
    print(counts.shape)

(1000, 17624)
(1000, 17625)
(1000, 17625)
(1000, 80)
(1000, 22)
(1000, 9)


# Transform

In [5]:
# Single TF-IDF transformer with default settings. The next cell calls
# fit_transform on it once per count matrix, so after that cell it holds only
# the statistics of the last matrix it was fitted on.
tfidf = TfidfTransformer()

In [6]:
# Re-weight each count matrix with TF-IDF. The shared `tfidf` object is refitted
# for every matrix, in the same order as before (max_df variants first, then
# min_df variants); the trailing underscore marks the TF-IDF version.
X_60_, X_70_, X_80_ = [tfidf.fit_transform(counts) for counts in (X_60, X_70, X_80)]

X_10_, X_20_, X_30_ = [tfidf.fit_transform(counts) for counts in (X_10, X_20, X_30)]


In [7]:
# Hold out 33 % of the reviews for testing, once per feature representation.
#
# A fixed seed is passed to every call. Without it each call draws a different
# random split, so results change on every run and the feature variants are
# scored on different test reviews. With the same seed and the same number of
# rows, every variant gets the same train/test rows, so scores are comparable.
#
# NOTE(review): the vectorizers and the TF-IDF transformer were fitted on the
# full dataset before this split, so the test rows influenced the vocabulary
# and IDF weights. Fit them on the training rows only if a clean estimate is
# needed.
SPLIT_SEED = 42
TEST_FRACTION = 0.33
labels = dataset['sentiment']

# Raw counts (no TF-IDF) baseline.
X_train, X_test, y_train, y_test = train_test_split(
    X, labels, test_size=TEST_FRACTION, random_state=SPLIT_SEED)

# TF-IDF features with max_df = 0.6 / 0.7 / 0.8.
X_train_60, X_test_60, y_train_60, y_test_60 = train_test_split(
    X_60_, labels, test_size=TEST_FRACTION, random_state=SPLIT_SEED)
X_train_70, X_test_70, y_train_70, y_test_70 = train_test_split(
    X_70_, labels, test_size=TEST_FRACTION, random_state=SPLIT_SEED)
X_train_80, X_test_80, y_train_80, y_test_80 = train_test_split(
    X_80_, labels, test_size=TEST_FRACTION, random_state=SPLIT_SEED)

# TF-IDF features with min_df = 0.1 / 0.2 / 0.3.
X_train_10, X_test_10, y_train_10, y_test_10 = train_test_split(
    X_10_, labels, test_size=TEST_FRACTION, random_state=SPLIT_SEED)
X_train_20, X_test_20, y_train_20, y_test_20 = train_test_split(
    X_20_, labels, test_size=TEST_FRACTION, random_state=SPLIT_SEED)
X_train_30, X_test_30, y_train_30, y_test_30 = train_test_split(
    X_30_, labels, test_size=TEST_FRACTION, random_state=SPLIT_SEED)


# Fitting

In [8]:
# Shared MLP classifier (defaults apart from max_iter=300), first fitted on the
# raw-count baseline split. The later cells call `model.fit` again on this same
# object, so this baseline fit is replaced before it is ever scored.
model = MLPClassifier(max_iter=300)
model.fit(X=X_train, y=y_train)

MLPClassifier(activation='relu', alpha=0.0001, batch_size='auto', beta_1=0.9,
              beta_2=0.999, early_stopping=False, epsilon=1e-08,
              hidden_layer_sizes=(100,), learning_rate='constant',
              learning_rate_init=0.001, max_iter=300, momentum=0.9,
              n_iter_no_change=10, nesterovs_momentum=True, power_t=0.5,
              random_state=None, shuffle=True, solver='adam', tol=0.0001,
              validation_fraction=0.1, verbose=False, warm_start=False)

In [9]:
# Refit the shared MLP on the max_df=0.6 TF-IDF training split.
model.fit(X_train_60, y_train_60)

# Predictions for the matching held-out split, kept in `predict_60`.
predict_60 = model.predict(X_test_60)

# Report test-set score, the model's `loss_` ("perda" is Portuguese for
# "loss") and the F1 score with "positive" as the positive label.
print('score:', model.score(X_test_60, y_test_60))
print('perda:', model.loss_)
print('f1 score:', f1_score(y_test_60, predict_60, pos_label="positive"))

score: 0.8090909090909091
perda: 0.004518810196017992
f1 score: 0.821529745042493


In [10]:
# Refit the shared MLP on the max_df=0.7 TF-IDF training split.
model.fit(X_train_70, y_train_70)

# Predictions for the matching held-out split, kept in `predict_70`.
predict_70 = model.predict(X_test_70)

# Report test-set score, the model's `loss_` ("perda" is Portuguese for
# "loss") and the F1 score with "positive" as the positive label.
print('score:', model.score(X_test_70, y_test_70))
print('perda:', model.loss_)
print('f1 score:', f1_score(y_test_70, predict_70, pos_label="positive"))

score: 0.8212121212121212
perda: 0.004665947456594473
f1 score: 0.8259587020648967


In [11]:
# Refit the shared MLP on the max_df=0.8 TF-IDF training split.
model.fit(X_train_80, y_train_80)

# Predictions for the matching held-out split, kept in `predict_80`.
predict_80 = model.predict(X_test_80)

# Report test-set score, the model's `loss_` ("perda" is Portuguese for
# "loss") and the F1 score with "positive" as the positive label.
print('score:', model.score(X_test_80, y_test_80))
print('perda:', model.loss_)
print('f1 score:', f1_score(y_test_80, predict_80, pos_label="positive"))

score: 0.803030303030303
perda: 0.0045239529144014785
f1 score: 0.7962382445141065


In [12]:
# Refit the shared MLP on the min_df=0.1 TF-IDF training split.
model.fit(X_train_10, y_train_10)

# Predictions for the matching held-out split, kept in `predict_10`.
predict_10 = model.predict(X_test_10)

# Report test-set score, the model's `loss_` ("perda" is Portuguese for
# "loss") and the F1 score with "positive" as the positive label.
print('score:', model.score(X_test_10, y_test_10))
print('perda:', model.loss_)
print('f1 score:', f1_score(y_test_10, predict_10, pos_label="positive"))

score: 0.6787878787878788
perda: 0.09136098074860541
f1 score: 0.6666666666666666




In [16]:
# Refit the shared MLP on the min_df=0.2 TF-IDF training split.
model.fit(X_train_20, y_train_20)

# Predictions for the matching held-out split, kept in `predict_20`.
predict_20 = model.predict(X_test_20)

# Report test-set score, the model's `loss_` ("perda" is Portuguese for
# "loss") and the F1 score with "positive" as the positive label.
print('score:', model.score(X_test_20, y_test_20))
print('perda:', model.loss_)
print('f1 score:', f1_score(y_test_20, predict_20, pos_label="positive"))

score: 0.6212121212121212
perda: 0.3811043236093726
f1 score: 0.6290801186943621




In [17]:
# Refit the shared MLP on the min_df=0.3 TF-IDF training split.
model.fit(X_train_30, y_train_30)

# Predictions for the matching held-out split, kept in `predict_30`.
predict_30 = model.predict(X_test_30)

# Report test-set score, the model's `loss_` ("perda" is Portuguese for
# "loss") and the F1 score with "positive" as the positive label.
print('score:', model.score(X_test_30, y_test_30))
print('perda:', model.loss_)
print('f1 score:', f1_score(y_test_30, predict_30, pos_label="positive"))

score: 0.5515151515151515
perda: 0.6005203704818793
f1 score: 0.5286624203821656


