# Clustering

In [None]:
import pandas as pd
from pandas_profiling import ProfileReport
from sklearn.cluster import MeanShift
from sklearn.feature_extraction.text import CountVectorizer

# Disable pandas' chained-assignment (SettingWithCopy) warning for the whole
# session; with this set to None, pandas emits nothing on chained assignment.
pd.options.mode.chained_assignment = None

In [None]:
# Load the pickled DataFrame that the rest of the notebook works on; later
# cells read its `stemmed_lancaster` and `stemmed_porter` columns.
# NOTE: unpickling can execute arbitrary code -- only load trusted files.
df = pd.read_pickle('stemmed_words_20201116.pkl')

## Transforming words into vectors

In [None]:
def to_text(data):
    """Join the string items of ``data`` into one space-separated string.

    ``data`` may be any iterable of strings; an empty iterable yields ``''``.
    """
    separator = ' '
    joined = separator.join(data)
    return joined

In [None]:
# Flatten each row's stemmed tokens into a single space-separated string,
# one helper column per stemmer, so they can be fed to the vectorizer.
# (presumably every cell holds a list of stem strings -- confirm upstream)
lancaster_stems = df['stemmed_lancaster']
df['tmp1'] = lancaster_stems.apply(to_text)
porter_stems = df['stemmed_porter']
df['tmp2'] = porter_stems.apply(to_text)

In [None]:
# Pull the joined text columns out as arrays of documents:
# corpus1 comes from `tmp1` (Lancaster), corpus2 from `tmp2` (Porter).
corpus1, corpus2 = df['tmp1'].values, df['tmp2'].values

In [None]:
# Bag-of-words vectorizer with default settings; it is fitted in the
# following cells via fit_transform.
vectorizer = CountVectorizer()

In [None]:
# Fit the vectorizer on the Lancaster-stemmed corpus and build its
# document-term count matrix.
X1 = vectorizer.fit_transform(corpus1)

In [None]:
# Fit the vectorizer on the Porter-stemmed corpus and build its document-term
# count matrix. This must use corpus2: vectorizing corpus1 here would make X2
# a duplicate of the Lancaster matrix, and the Porter clustering meaningless.
# fit_transform re-fits `vectorizer`, so its vocabulary now reflects corpus2.
X2 = vectorizer.fit_transform(corpus2)

## Clustering based on Lancaster stemming

In [None]:
# Run mean-shift clustering (fixed bandwidth of 2) on the Lancaster count
# matrix; `.toarray()` converts X1 to a dense array before fitting.
clustering = MeanShift(bandwidth=2).fit(X1.toarray())

In [None]:
# Cluster labels assigned by the fitted Lancaster model.
labels = clustering.labels_

In [None]:
# Store the Lancaster-based cluster labels as a new DataFrame column.
df['label_lancaster'] = labels

## Clustering based on Porter stemming

In [None]:
# Run mean-shift clustering (fixed bandwidth of 2) on the X2 count matrix,
# densified with `.toarray()`. This rebinds `clustering`, replacing the model
# fitted on X1.
clustering = MeanShift(bandwidth=2).fit(X2.toarray())

In [None]:
# Cluster labels from the model fitted on X2 (overwrites the earlier `labels`).
labels = clustering.labels_

In [None]:
# Store the Porter-based cluster labels as a new DataFrame column.
df['label_porter'] = labels

## Save to pickle file

In [None]:
# Keep only the columns needed downstream (the original fields plus both
# cluster-label columns) and write that subset to a pickle file.
output_columns = [
    'text',
    'language',
    'views',
    'conversions',
    'cvr',
    'label_lancaster',
    'label_porter',
]
dff = df[output_columns]
dff.to_pickle('clustering_20201116.pkl')