In [1]:
import pandas as pd

articles = pd.read_csv('articles.csv', index_col='Identifier')

In [2]:
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
nltk.download('stopwords')
nltk.download('wordnet')

[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/liumukun/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     /Users/liumukun/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

In [3]:
# Create function to process text

import string
custom_stopwords = ['party', 'agreement', 'chapter', 'article']
def text_process(text):
    stemmer = WordNetLemmatizer()
    nopunc = [char for char in text if char not in string.punctuation]
    nopunc = ''.join([i for i in nopunc if not i.isdigit()])
    nopunc =  [word.lower() for word in nopunc.split() if word not in stopwords.words('english')]
    nopunc =  [word.lower() for word in nopunc if word not in custom_stopwords]
    nopunc =  [word.lower() for word in nopunc if len(word) > 3]
    return [stemmer.lemmatize(word) for word in nopunc]

In [4]:
# Process articles

processed = articles.apply(lambda article: text_process(article), axis=1)

In [1]:
# Convert to tf-idf

from sklearn.feature_extraction.text import TfidfVectorizer

tfidfconvert = TfidfVectorizer(analyzer='word',ngram_range=(1,1)).fit([' '.join(article) for article in processed])
X_transformed = tfidfconvert.transform([' '.join(article) for article in processed])

NameError: name 'processed' is not defined

In [80]:
# Create function to map indices to words

def create_map(vocab, clusters):
    inv_map = {v: k for k, v in vocab.items()} 
    dicts = [{index: value for (index, value) in enumerate(cluster) if value > 0} for cluster in clusters]
    sorted_dicts = [{k: v for k, v in sorted(_dict.items(), key=lambda item: item[-1], reverse=True)} for _dict in dicts]
    labeled_dicts = [{inv_map[k]: v for k, v in _dict.items()} for _dict in sorted_dicts]
    first_10_pairs = [{k: _dict[k] for k in list(_dict)[:10]} for _dict in labeled_dicts]
    return first_10_pairs

In [110]:
# Cluster using K-means

from sklearn.cluster import KMeans

kmeans = KMeans(n_clusters=3, init='k-means++', n_init=100).fit(X_transformed)

In [111]:
# Print top words in each cluster

print(create_map(tfidfconvert.vocabulary_, kmeans.cluster_centers_)[0])
print(create_map(tfidfconvert.vocabulary_, kmeans.cluster_centers_)[1])
print(create_map(tfidfconvert.vocabulary_, kmeans.cluster_centers_)[2])

{'shall': 0.06292169474192705, 'party': 0.053846239535849855, 'agreement': 0.036737808909043095, 'trade': 0.03372382226676572, 'measure': 0.030769635384145243, 'committee': 0.029785631454605548, 'information': 0.021826014150818186, 'parties': 0.021374302282514312, 'joint': 0.021108682599401886, 'request': 0.020784063847591523}
{'service': 0.16199964372517123, 'supplier': 0.09969548725109252, 'financial': 0.08001410124718303, 'investment': 0.06461323724141846, 'investor': 0.06446557741565448, 'telecommunication': 0.05512211819372553, 'shall': 0.04650619958804002, 'party': 0.04598090403023955, 'territory': 0.044903340247931925, 'enterprise': 0.0444125956588261}
{'good': 0.13590443955421883, 'custom': 0.0827907131588339, 'duty': 0.07133322186241634, 'originating': 0.06842474381022443, 'shall': 0.058925983096964035, 'product': 0.05575777490867084, 'material': 0.04741648037205108, 'party': 0.04489974875943644, 'territory': 0.04442196779035106, 'origin': 0.03873251498589244}


In [277]:
# Label articles

df = pd.DataFrame(processed, columns=['Article'])
df = df.reset_index()
df['Class 0'] = pd.Series(kmeans.labels_) == 0
df['Class 1'] = pd.Series(kmeans.labels_) == 1
df['Class 2'] = pd.Series(kmeans.labels_) == 2

In [299]:
# Label agreements

classified = df.groupby('Identifier').sum()
total = classified['Class 0']+classified['Class 1']+classified['Class 2']
classified['Class 0'] = classified['Class 0']/total
classified['Class 1'] = classified['Class 1']/total
classified['Class 2'] = classified['Class 2']/total
classified.to_csv('unsupervised.csv')