In [145]:
import matplotlib.pyplot as plt
import nltk
from nltk.corpus import stopwords
import numpy as np
import pandas as pd
import re

from sklearn.cluster import KMeans
from sklearn.decomposition import TruncatedSVD
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.model_selection import train_test_split

from my_modules import custom_f1_score


In [72]:
# Get the list of stopwords
nltk.download('stopwords')
stopwords_list = set(stopwords.words('english'))

dataset = pd.read_csv("data/training_data.tsv.gz", sep="\t", header=None)
dataset.dropna(inplace=True)

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Andreas\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [73]:
df = dataset.head(100000)

df.rename(columns={0: 'index', 1: 'title', 2: 'text', 3: 'labels'}, inplace=True)
df.drop('index', axis=1, inplace=True)

# Get the features
X  = df.drop('labels', axis=1)

# Start preprocessing the data
# We Lower case all the data
# we remove all stop words (the, a, etc)
# We remove other words that are from html (e.g <div><!div>)
X = X.applymap(lambda x: re.sub(r'<.*?>|[^\w\s]', '', x.lower())).applymap(lambda x: ' '.join([word for word in x.split() if word not in stopwords_list]))

# We will now concat title and text into one dataset
X = X["title"] + X["text"]
y = df['labels'].str.get_dummies(',')

# Split the data into training and test sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, shuffle=False)


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df.rename(columns={0: 'index', 1: 'title', 2: 'text', 3: 'labels'}, inplace=True)
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df.drop('index', axis=1, inplace=True)


In [99]:
# We want to get all the unique labels
unique_labels = y.drop_duplicates().reset_index(drop=True)
print(f"There are {len(unique_labels)} different labels. Let's see how often each combination occurs.")
print('Ideally we want to have at least one cluster for each combination later. We will experiment with the n_clusters parameter for kmeans.')

combination_counts = y.groupby(y.columns.tolist()).size().reset_index(name='count')
combination_counts

There are 15 different labels. Let's see how often each combination occurs.
Ideally we want to have at least one cluster for each combination later. We will experiment with the n_clusters parameter for kmeans.


Unnamed: 0,css,html,javascript,jquery,count
0,0,0,0,1,24273
1,0,0,1,0,29017
2,0,0,1,1,12388
3,0,1,0,0,9667
4,0,1,0,1,1340
5,0,1,1,0,3984
6,0,1,1,1,1506
7,1,0,0,0,7268
8,1,0,0,1,1137
9,1,0,1,0,1053


In [74]:
# Initialize the TfidfVectorizer - we normalize the text into numerical values
vectorizer = TfidfVectorizer()
pca = TruncatedSVD(n_components=3)

In [75]:
# Fit and transform the text data and apply dimensionality reduction
X_train_vectorized = vectorizer.fit_transform(X_train)
X_test_vectorized = vectorizer.transform(X_test)

X_train_pca = pca.fit_transform(X_train_vectorized)
X_test_pca = pca.transform(X_test_vectorized)

In [128]:
# Initialize the Kmeans classifier
kmeans = KMeans(n_clusters=100)

# Train the Kmeans classifier
kmeans.fit(X_train_pca)



In [129]:
# Assign labels for each centroid, take the label from the data point with the highest cosine similarity
centroids = kmeans.cluster_centers_

assigned_labels = []
for centroid in centroids:
    similarities = cosine_similarity([centroid], X_train_pca)
    closest_label_idx = np.argmax(similarities)
    assigned_label = y_train.loc[closest_label_idx].tolist()
    assigned_labels.append(assigned_label)

# Print the assigned labels for each centroid
for centroid, assigned_label in zip(centroids, assigned_labels):
    print(f"Centroid: {centroid}, Assigned Label: {assigned_label}")

Centroid: [ 0.07298068 -0.00355062 -0.00616151], Assigned Label: [0, 0, 0, 1]
Centroid: [0.18948568 0.0468476  0.02045305], Assigned Label: [0, 1, 1, 0]
Centroid: [ 0.10834305  0.05620913 -0.00669341], Assigned Label: [1, 1, 0, 1]
Centroid: [ 0.24350132 -0.07602063 -0.01101545], Assigned Label: [0, 0, 1, 0]
Centroid: [0.13618253 0.00920732 0.10941535], Assigned Label: [0, 0, 1, 0]
Centroid: [ 0.17831353  0.14342336 -0.16120559], Assigned Label: [0, 0, 0, 1]
Centroid: [ 0.10539225 -0.08780539 -0.02012135], Assigned Label: [0, 0, 0, 1]
Centroid: [ 0.1336716   0.01629226 -0.09266584], Assigned Label: [0, 1, 0, 1]
Centroid: [0.09322639 0.02363353 0.0168581 ], Assigned Label: [0, 0, 0, 1]
Centroid: [ 0.16101066  0.18428636 -0.0087691 ], Assigned Label: [0, 0, 0, 1]
Centroid: [0.199253   0.11162889 0.11500283], Assigned Label: [1, 0, 1, 0]
Centroid: [ 0.11267192 -0.03195879 -0.07675498], Assigned Label: [0, 0, 1, 0]
Centroid: [ 0.0817418  -0.04431219 -0.02100301], Assigned Label: [0, 1, 1, 0

In [130]:
list(set(tuple(x) for x in assigned_labels))

[(0, 0, 0, 1),
 (0, 0, 1, 0),
 (0, 1, 0, 1),
 (0, 1, 1, 1),
 (1, 0, 1, 1),
 (1, 1, 0, 0),
 (0, 1, 0, 0),
 (0, 1, 1, 0),
 (1, 0, 1, 0),
 (1, 0, 0, 0),
 (1, 0, 0, 1),
 (1, 1, 0, 1),
 (0, 0, 1, 1)]

In [140]:
# Make predictions on the test set
y_pred_cluster = kmeans.predict(X_test_pca)

In [141]:
# Transform cluster index to tuple
y_pred = [assigned_labels[x] for x in y_pred_cluster]

In [None]:
# Plot the data points and centroids
plt.scatter(X_test[:, 0], X_test[:, 1], c=y_pred)
plt.scatter(centroids[:, 0], centroids[:, 1], marker='x', c='red')
plt.xlabel('Feature 1')
plt.ylabel('Feature 2')
plt.title('Data Points and Centroids')
plt.legend(['Data Points', 'Centroids'])

# Show the plot
plt.show()

custom_f1 = custom_f1_score(y_pred, y_test)
print(f"The F1-score of our problem is {custom_f1}")

In [146]:
custom_f1 = custom_f1_score(y_pred, y_test)
print(f"The F1-score of our problem is {custom_f1}")

AttributeError: 'list' object has no attribute 'shape'