In [1]:
import numpy as np
import pandas as pd
import re
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.metrics import accuracy_score, recall_score, f1_score, precision_score
import nltk
from sklearn.linear_model import LogisticRegression

In [2]:
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


True

In [3]:
data = pd.read_csv('/content/train.csv')

In [4]:
data.isnull().sum()

id           0
title      558
author    1957
text        39
label        0
dtype: int64

In [5]:
data = data.fillna('')

In [15]:
data['content'] = data['author'] + ' ' + data['title']

In [16]:
port_stem = PorterStemmer()

In [17]:
def stemming(content):
  stemmed = re.sub('[^a-zA-Z]', ' ', content)
  stemmed = stemmed.lower()
  stemmed = stemmed.split()
  stemmed = [port_stem.stem(word) for word in stemmed if not word in stopwords.words('english')]
  stemmed = ' '.join(stemmed)
  return stemmed

In [18]:
data['content'] = data['content'].apply(stemming)

In [26]:
x = data['content'].values
y = data['label'].values

In [28]:
vectorizer = TfidfVectorizer()
vectorizer.fit(x)
x = vectorizer.transform(x)

In [31]:
x_train, x_test, y_train, y_test = train_test_split(x,y ,test_size = 0.2, random_state = 2, stratify = y)

In [32]:
model = LogisticRegression()

In [33]:
model.fit(x_train, y_train)

LogisticRegression()

In [34]:
x_pred = model.predict(x_test)

In [35]:
print(accuracy_score(x_pred, y_test))
print(precision_score(x_pred, y_test))
print(recall_score(x_pred, y_test))
print(f1_score(x_pred, y_test))

0.9790865384615385
0.9932789246279404
0.9659197012138189
0.9794082840236686


In [37]:
param_grid = {
    'n_neighbors' : np.arange(1,20),
    'metric': ['euclidian', 'manhattan','minkowski']
}
grid = GridSearchCV(KNeighborsClassifier(), param_grid, cv= 5)
grid.fit(x_train, y_train)
print('best score = ', grid.best_score_)
print('best params = ', grid.best_params_)
model = grid.best_estimator_

best score =  0.5810096153846154
best params =  {'metric': 'manhattan', 'n_neighbors': 2}


95 fits failed out of a total of 285.
The score on these train-test partitions for these parameters will be set to nan.
If these failures are not expected, you can try to debug them by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
95 fits failed with the following error:
Traceback (most recent call last):
  File "/usr/local/lib/python3.8/dist-packages/sklearn/model_selection/_validation.py", line 680, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "/usr/local/lib/python3.8/dist-packages/sklearn/neighbors/_classification.py", line 198, in fit
    return self._fit(X, y)
  File "/usr/local/lib/python3.8/dist-packages/sklearn/neighbors/_base.py", line 437, in _fit
    self._check_algorithm_metric()
  File "/usr/local/lib/python3.8/dist-packages/sklearn/neighbors/_base.py", line 374, in _check_algorithm_metric
    raise ValueError(
ValueError: Metric 'eucli

In [38]:
x_pred = model.predict(x_test)
print(accuracy_score(x_pred, y_test))
print(precision_score(x_pred, y_test))
print(recall_score(x_pred, y_test))
print(f1_score(x_pred, y_test))

0.5923076923076923
1.0
0.5512040222281027
0.7106789491641079
