In [3]:
# install necessary dependencies
!pip install nltk
!pip install gensim



In [4]:
import nltk
import numpy as np
import pandas as pd

In [5]:
nltk.download("stopwords")
nltk.download("wordnet")
nltk.download("punkt")

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Unzipping corpora/wordnet.zip.
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


True

In [6]:
#load dataset
def loadDataset():
    f = open('data4.csv','r',encoding='unicode_escape')
    dataset = []
    for line in f.readlines():
        dataset.append(list(line.strip('\n').split(' ')))
    f.close()
    return dataset
dataset_raw = loadDataset()
print(dataset_raw[0:2])

[['Japan', ',', 'Japan', 'is', 'an', 'island', 'country', 'in', 'East', 'Asia', 'Located', 'in', 'the', 'Pacific', 'Ocean', '£¬', 'it', 'lies', 'off', 'the', 'eastern', 'coast', 'of', 'the', 'Asian', 'continent', 'and', 'stretches', 'from', 'the', 'Sea', 'of', 'Okhotsk', 'in', 'the', 'north', 'to', 'the', 'East', 'China', 'Sea', 'and', 'the', 'Philippine', 'Sea', 'in', 'the', 'south', ''], ['United', 'States', ',', 'The', 'United', 'States', 'of', 'America', '(USA)', '£¬', 'commonly', 'known', 'as', 'the', 'United', 'States', '(US', 'or', 'US)', 'or', 'America', '£¬', 'is', 'a', 'country', 'comprising', '50', 'states', '£¬', 'a', 'federal', 'district', '£¬', 'five', 'major', 'self-governing', 'territories', '£¬', 'and', 'various', 'possessions[h]', 'At', '38', 'million', 'square', 'miles', '(98', 'million', 'km2)', '£¬', 'the', 'United', 'States', 'is', 'the', "world's", 'third', 'or', 'fourth', 'largest', 'country', 'by', 'total', 'area[d]', 'and', 'is', 'slightly', 'smaller', 'than',

In [7]:
#preprocess
from nltk.corpus import wordnet as wn 

en_stop = nltk.corpus.stopwords.words('english')

en_stop = ["``","/",",.",".,",";","--",":",")","(",'"','&',"'",'),',',"','-','.,','.,"','.-',"?",">","<"]                  \
         +["0","1","2","3","4","5","6","7","8","9","10","11","12","86","1986","1987","000"]                                                      \
         +["said","say","u","v","mln","ct","net","dlrs","tonne","pct","shr","nil","company","lt","share","year","billion","price"]          \
         +en_stop

def preprocess_word(word, stopwordset):
    
    word=word.lower()
    
    if word in [",",".","£¬"]:
        return None
    
    if word in stopwordset:
        return None
    
    lemma = wn.morphy(word)
    if lemma is None:
        return word

    elif lemma in stopwordset: 
        return None
    else:
        return lemma
    

def preprocess_document(document):
    document=[preprocess_word(w, en_stop) for w in document]
    document=[w for w in document if w is not None]
    return document

def preprocess_documents(documents):
    return [preprocess_document(document) for document in documents]

In [8]:
 dataset = preprocess_documents(dataset_raw)
 print(dataset[0])

['japan', 'japan', 'island', 'country', 'east', 'asia', 'locate', 'pacific', 'ocean', 'lie', 'eastern', 'coast', 'asian', 'continent', 'stretch', 'sea', 'okhotsk', 'north', 'east', 'china', 'sea', 'philippine', 'sea', 'south', '']


In [9]:
#vectorization with tf-idf
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.cluster import KMeans

def transform(dataset,n_features=1000):
    pre_data=[" ".join(doc) for doc in dataset]
    vectorizer = TfidfVectorizer(max_features=200, token_pattern=u'(?u)\\b\\w+\\b' )
    X = vectorizer.fit_transform(pre_data)
    return X,vectorizer

X, vectorizer = transform(dataset)

In [10]:
#clustering with k-means
num_clusters = 5
km = KMeans(n_clusters=num_clusters, random_state = 0)
# fit
clusters = km.fit_predict(X)

for doc, cls in zip(dataset, clusters):
  if cls == 0:
    print(doc[0])

moldova
ukraine
belarus
poland
georgia


In [11]:
#evaluate with Silhouette Coefficient
from sklearn import metrics
print(metrics.silhouette_score(X, clusters, metric='euclidean'))

0.033640139409598965
