In [2]:
# https://stackabuse.com/read-a-file-line-by-line-in-python/
# https://codedocs.xyz/annoviko/pyclustering/classpyclustering_1_1cluster_1_1kmedoids_1_1kmedoids.html#details
# https://towardsdatascience.com/topic-modeling-and-latent-dirichlet-allocation-in-python-9bf156893c24

import warnings
# NOTE: this silences every warning for the whole kernel, including library
# deprecation notices.
warnings.filterwarnings(action = 'ignore')

import nltk
from nltk.tokenize import sent_tokenize, word_tokenize
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer

# NLTK resources needed at call time: the stop-word list, WordNet (used by
# WordNetLemmatizer) and the Punkt sentence tokenizer that sent_tokenize /
# word_tokenize load on first use. Without 'punkt' a fresh environment fails
# with a LookupError on the first tokenize call.
# NOTE(review): recent NLTK releases look for 'punkt_tab' instead - confirm
# against the installed version.
nltk.download('stopwords')
nltk.download('wordnet')
nltk.download('punkt')

import gensim
from gensim.models import Word2Vec

from sklearn.decomposition import PCA
from matplotlib import pyplot

from pyclustering.cluster import kmedoids

[nltk_data] Downloading package stopwords to C:\Users\Isabela
[nltk_data]     Lago\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to C:\Users\Isabela
[nltk_data]     Lago\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [3]:
# Stop-word sets keyed by language, filled lazily so the corpus file is read
# once instead of once per processed summary.
_STOP_WORDS_CACHE = {}


def _cached_stop_words(language):
    """Return the NLTK stop-word set for ``language``, loading it only once."""
    if language not in _STOP_WORDS_CACHE:
        _STOP_WORDS_CACHE[language] = frozenset(stopwords.words(language))
    return _STOP_WORDS_CACHE[language]


def preprocess(line, lem):
    """Turn raw text into a list of sentences of cleaned tokens.

    Each sentence found by ``sent_tokenize`` becomes one list of lowercase,
    lemmatized tokens with English stop words and tokens of length <= 2
    removed. Sentences that end up empty are kept as empty lists.

    Parameters
    ----------
    line : str
        Raw text (one plot summary in this notebook).
    lem : object
        Lemmatizer exposing ``lemmatize(word)`` (a ``WordNetLemmatizer`` here).

    Returns
    -------
    list[list[str]]
        One token list per sentence, in input order.
    """
    stop_words = _cached_stop_words('english')
    data = []

    for sentence in sent_tokenize(line):
        tokens = []
        for word in word_tokenize(sentence):
            token = word.lower()
            # Drop stop words before lemmatizing: the lemmatizer can mangle
            # them into forms that are no longer in the stop list
            # (e.g. "does" can come back as "doe").
            if token in stop_words:
                continue
            # Lemmatize the lowercase form: the WordNet lookup is
            # case-sensitive, so "Dogs" would otherwise be left unchanged.
            lemma = lem.lemmatize(token)
            if lemma not in stop_words and len(lemma) > 2:
                tokens.append(lemma)
        data.append(tokens)
    return data

def avgcosines(model1, vocab):
    """Compute each word's mean cosine similarity to the whole vocabulary.

    For every word in ``vocab`` the similarity to every word in ``vocab``
    (itself included) is summed and divided by ``len(vocab)``.

    Parameters
    ----------
    model1 : object
        A trained model exposing its vectors as ``model1.wv`` (e.g. gensim
        ``Word2Vec``), or any object that itself offers ``similarity(a, b)``.
    vocab : list[str]
        Words to compare; every word must be known to the model.

    Returns
    -------
    list[list]
        ``[word, average_similarity]`` pairs in the order of ``vocab``;
        an empty list when ``vocab`` is empty.
    """
    # Calling similarity() on the model object is deprecated in gensim in
    # favour of model.wv.similarity(); fall back to the object itself so a
    # bare keyed-vectors instance is accepted too.
    vectors = getattr(model1, 'wv', model1)
    vocab_size = len(vocab)
    avgcossim = []

    for word in vocab:
        total = 0.0
        for other in vocab:
            total = total + vectors.similarity(word, other)
        avgcossim.append([word, total / vocab_size])

    return avgcossim

def getmedoids(result):
    """Run k-medoids with two clusters on ``result`` and return the medoids.

    Parameters
    ----------
    result : sequence of points
        Data to cluster (the 2-D PCA projection of the word vectors in this
        notebook). Must support ``len()``.

    Returns
    -------
    list[int]
        Medoid positions as reported by ``get_medoids()``; the caller uses
        them as row indices into ``result``. With two or fewer points the
        indices of all points are returned without clustering.
    """
    # Row indices used as the starting medoids.
    initial_medoids = [1, 2]

    n_points = len(result)
    if n_points <= max(initial_medoids):
        # Seeds 1 and 2 do not both exist; with at most two points every
        # point is trivially its own medoid.
        return list(range(n_points))

    kmedoids_instance = kmedoids.kmedoids(result, initial_medoids)
    kmedoids_instance.process()
    return kmedoids_instance.get_medoids()

def column1(e):
    """Sort key: return the element at index 1 of ``e``."""
    second_field = e[1]
    return second_field

In [10]:
# NOTE(review): absolute, machine-specific paths - consider a configurable data directory.
path = "C:\\Users\\Isabela Lago\\Downloads\\MovieSummaries.tar\\MovieSummaries\\plot_summaries.txt"
outpath = 'C:\\Users\\Isabela Lago\\Downloads\\topicsTEST.txt'

# Fewest distinct tokens a summary needs for the three approaches below:
# getmedoids seeds k-medoids with row indices 1 and 2, PCA projects onto two
# components, and the two best cosine-similarity words are written out.
MIN_VOCAB = 3

lem = WordNetLemmatizer()

# Both files are managed by the with-statement so the output is flushed and
# closed even when the (long-running) loop is interrupted or raises.
with open(path, "r", encoding="utf8") as filepath, \
        open(outpath, 'w', encoding="utf8") as outf:
    for line in filepath:
        # Strip the line terminator first; otherwise it stays attached to the
        # summary and splits every output row across two lines.
        splitline = line.rstrip('\n').split('\t')
        if len(splitline) < 2:
            continue     # blank or malformed line: no summary column
        rawline = splitline[1]
        data = preprocess(rawline, lem)     # tokenize, lemmatize, stopword removal

        distinct_tokens = {token for sentence in data for token in sentence}
        if len(distinct_tokens) < MIN_VOCAB:
            # Too little text to model; keep the row with empty topic columns
            # so the output stays aligned with the input.
            outf.write(rawline + '\t\t\t\n')
            continue

        # NOTE(review): a fresh Word2Vec and a multi-worker LDA model are
        # trained for every summary, which is likely why a full run is slow.
        model1 = gensim.models.Word2Vec(data, min_count = 1, size = 100, window = 5)   # create model
        vocab = list(model1.wv.vocab)

        # k-medoids approach: cluster the 2-D PCA projection of the word
        # vectors; the medoid indices are positions in `vocab`.
        X = model1.wv[vocab]     # vectors live on .wv; indexing the model itself is deprecated
        result = PCA(n_components=2).fit_transform(X)
        medoids = getmedoids(result)
        mostsimilarwords = [vocab[index] for index in medoids]

        # cosine similarity approach: rank words by mean similarity to the vocabulary
        avgcossim = avgcosines(model1, vocab)
        avgcossim.sort(reverse=True, key=column1)

        # lda approach: a single topic, described by its two top words
        dictionary = gensim.corpora.Dictionary(data)
        bow_corpus = [dictionary.doc2bow(sentence) for sentence in data]
        lda_model = gensim.models.LdaMulticore(bow_corpus, num_topics=1, id2word=dictionary, passes=2, workers=2)
        ldatopic = lda_model.print_topic(topicno=0,topn=2)
        # Keep only letters and single spaces from the printed topic string.
        ldatopicclean = ""
        for c in ldatopic:
            if c.isalpha() or (c.isspace() and not ldatopicclean.endswith(' ')):
                ldatopicclean += c

        # One tab-separated row per summary; each word in columns 2 and 3 is
        # followed by a space, as before.
        columns = [
            rawline,                                               # column 1: the summary
            ''.join(word + ' ' for word in mostsimilarwords),      # column 2: k-medoids
            ''.join(entry[0] + ' ' for entry in avgcossim[:2]),    # column 3: cosine similarity
            ldatopicclean,                                         # column 4: lda
        ]
        outf.write('\t'.join(columns) + '\n')

['prejudice', 0.09747331154843171]
['leg', 0.020116700549052118]
['manapally', 0.018411077187314124]
['underhanded', 0.021830018883759437]
['convincing', 0.023592700065440824]
['tell', 0.026574506276996212]
['comfort', 0.02226376525081585]
['accompanies', 0.021562703118219766]
['persuades', 0.020824342645742684]
['part', 0.045947569377021866]
['texas', 0.02278980586808017]
['miluska', 0.05239447302369098]
['milligan', 0.061661385028855875]
['arab', 0.021266670750409448]


KeyboardInterrupt: 