In [1]:
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.neighbors import KNeighborsClassifier
from sklearn.decomposition import TruncatedSVD
import re
import nltk
from nltk.corpus import stopwords

from my_modules import custom_f1_score


In [3]:
# Get the list of stopwords
nltk.download('stopwords')
stopwords_list = set(stopwords.words('english'))

dataset = pd.read_csv("data/training_data.tsv.gz", sep="\t", header=None)
dataset.dropna(inplace=True)

[nltk_data] Downloading package stopwords to C:\Users\Konstantinos
[nltk_data]     Razgkel\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [53]:
df = dataset.head(1000000)

df.rename(columns={0: 'index', 1: 'title', 2: 'text', 3: 'labels'}, inplace=True)
df.drop('index', axis=1, inplace=True)

# Get the features
X  = df.drop('labels', axis=1)

# Start preprocessing the data
# We Lower case all the data
# we remove all stop words (the, a, etc)
# We remove other words that are from html (e.g <div><!div>)
X = X.applymap(lambda x: re.sub(r'<.*?>|[^\w\s]', '', x.lower())).applymap(lambda x: ' '.join([word for word in x.split() if word not in stopwords_list]))

# We will now conca title and text into one dataset
X = X["title"] + X["text"]
y = df['labels'].str.get_dummies(',')

# Split the data into training and test sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, shuffle=False)

prediction_stored_data = pd.DataFrame(X_test).copy()

# Initialize the TfidfVectorizer - we normalize the text into numerical values
vectorizer = TfidfVectorizer()
pca = TruncatedSVD(n_components=2)

# Fit and transform the text data and apply dimensionality reduction
X_train = pca.fit_transform(vectorizer.fit_transform(X_train))
X_test = pca.transform(vectorizer.transform(X_test))

knn = KNeighborsClassifier()

# Create a dictionary of hyperparameters to search over
param_grid = {'n_neighbors': range(1, 15)}

# Perform grid search with cross-validation
grid_search = GridSearchCV(knn, param_grid, cv=10)
grid_search.fit(X_train, y_train)

# Get the best n_neighbors value
best_n_neighbors = grid_search.best_params_['n_neighbors']
print("Best n_neighbors:", best_n_neighbors)

# Initialize the KNN classifier
knn = KNeighborsClassifier(n_neighbors=best_n_neighbors)

# Train the KNN classifier
knn.fit(X_train, y_train)

# Make predictions on the test set
y_pred = knn.predict(X_test)

# Make the dataframes
prediction_df = pd.DataFrame(y_pred, columns=y.columns)
prediction_df.reset_index(drop=True, inplace=True)

prediction_stored_data.reset_index(drop=True, inplace=True)

custom_f1_score = custom_f1_score(y_test, y_pred)
print(f"The F1-score of our problem is {custom_f1_score}")

# The unknown data with the prediction
final_results = pd.concat([prediction_stored_data, prediction_df], axis=1)
final_results

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df.rename(columns={0: 'index', 1: 'title', 2: 'text', 3: 'labels'}, inplace=True)
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df.drop('index', axis=1, inplace=True)


Best n_neighbors: 13
The F1-score of our problem is 0.705675


Unnamed: 0,0,css,html,javascript,jquery
0,send another value ajax file uploadupload func...,0,0,0,1
1,possible limit user upload image fixed dimensi...,0,0,0,0
2,greying button codebehind work ietwo drop list...,0,0,0,0
3,javascript template inheritancedjango framewor...,0,0,1,1
4,implement facebook like button jquery gallerif...,0,0,0,0
...,...,...,...,...,...
19995,finding combinations javascript array valuespr...,0,0,1,1
19996,capture click event flash objectdiv onclick ev...,0,0,1,0
19997,winforms equivalent javascript settimeoutsimpl...,0,0,1,0
19998,javascript random problemvar swf1swf2swf3swf v...,0,0,1,0


In [57]:
print(y_test)

In [58]:
y_test

In [None]:
y_pred