In [None]:
from google.colab import drive
import os

# Attach Google Drive, then report each dataset file that is present.
drive.mount('/content/drive')

for csv_path in (
    '/content/drive/MyDrive/CL/datasets/positive.csv',
    '/content/drive/MyDrive/CL/datasets/negative.csv',
):
    if os.path.exists(csv_path):
        print("YES")

In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import random
import numpy as np
import warnings

# Silence all Python warnings for the rest of the session.
warnings.filterwarnings('ignore')
# Seeds only the stdlib `random` module; NumPy and other libraries keep their own RNG state.
random.seed(1228)
# Show full cell contents instead of truncating long text columns.
pd.set_option('display.max_colwidth', None)

%matplotlib inline

In [None]:
from pymystem3 import Mystem
import re

# Module-level Mystem instance; lemmatize() below takes it as its default analyzer.
m = Mystem()
# Characters kept by words_only(): Cyrillic letters (Ё/ё are listed explicitly
# because they lie outside the А-я range), Latin letters, and the punctuation
# used in emoticons, hashtags and similar tokens. The Latin range is spelled
# A-Za-z because A-z would also admit [ \ ] ^ and the backtick.
regex = re.compile(r"[А-Яа-яЁёA-Za-z:=!()#_%/|]+")

def words_only(text, regex=regex):
    """Keep only the character runs matched by ``regex``, joined by single spaces.

    Args:
        text: Input string. Anything that is not a ``str`` (for example the
            float NaN pandas uses for missing cells, or ``None``) is treated
            as empty.
        regex: Compiled pattern whose matches are kept; defaults to the
            module-level ``regex``.

    Returns:
        The matched runs joined with ``" "``, or ``""`` for non-string input.
    """
    # Explicit type guard instead of a bare ``except:``, so unrelated errors
    # (and KeyboardInterrupt) are no longer silently swallowed.
    if not isinstance(text, str):
        return ""
    return " ".join(regex.findall(text))

def lemmatize(text, mystem=m):
    """Lemmatise ``text`` with ``mystem`` and glue the pieces back into one string.

    Args:
        text: String to lemmatise.
        mystem: Object exposing ``lemmatize(text)`` that returns an iterable of
            string pieces; defaults to the module-level ``m``.

    Returns:
        The concatenated pieces with surrounding whitespace stripped, or
        ``" "`` when lemmatisation fails.
    """
    try:
        # Call the analyzer that was passed in; the body used to ignore the
        # ``mystem`` parameter and always went through the global ``m``.
        return "".join(mystem.lemmatize(text)).strip()
    except Exception:
        # Best-effort fallback kept as before, but no longer a bare ``except:``
        # so KeyboardInterrupt / SystemExit still propagate.
        return " "

# Quick visual check of words_only() on two sample strings mixing Latin,
# Cyrillic, digits, an emoticon and a hashtag.
print(words_only('g;iuhoikl 7.kjh 8_7h одлжд :))'))
print(words_only('g;iuhoikl 7.kjh 87h одлжд :)) #tag'))

In [None]:
# Positive-class texts: the file is ';'-separated with no header row, and only
# column 3 is loaded.
df_pos = pd.read_csv(
    "/content/drive/MyDrive/CL/datasets/positive.csv",
    sep=';',
    header=None,
    usecols=[3],
)
df_pos.tail()


In [None]:
# Negative-class texts, read with the same layout as positive.csv. The cell
# that labels and concatenates the two frames needs df_neg, which was never
# defined because this cell re-read positive.csv into df_pos.
df_neg = pd.read_csv("/content/drive/MyDrive/CL/datasets/negative.csv", sep=';', header = None, usecols = [3])
df_neg.tail()


In [None]:
df_neg['sent'] = 'neg'
df_pos['sent'] = 'pos'
df_pos['text'] = df_pos[3]
df_neg['text'] = df_neg[3]
df = pd.concat([df_neg, df_pos])
df = df[['text', 'sent']]
%time df.text = df.text.apply(words_only)
#%time df.text = df.text.apply(lemmatize)

In [None]:
# Load the already-processed corpus; the first CSV column becomes the index.
df = pd.read_csv(
    '/content/drive/MyDrive/CL/datasets/processed_text.csv',
    index_col=0,
)
df.head()

In [None]:
# NOTE(review): this cell repeats the previous load of processed_text.csv
# verbatim - presumably redundant; confirm before removing.
processed_csv = '/content/drive/MyDrive/CL/datasets/processed_text.csv'
df = pd.read_csv(processed_csv, index_col=0)
df.head()

In [None]:
# Tokenise on whitespace, one token list per row. Missing text (NaN, e.g. an
# empty string after a CSV round trip) becomes an empty list instead of raising
# AttributeError on float.split(); iterating the column directly also avoids a
# per-row .iloc lookup.
texts = [doc.split() for doc in df.text.fillna('').astype(str)]
texts[0]

In [None]:
%%time
from gensim.models import Word2Vec

model = Word2Vec(texts, window=5, min_count=5, workers=4)
model.save("word2v.model")

In [None]:
#model = Word2Vec.load("word2v.model")

In [None]:
# Nearest neighbours of "школа" (school) in the trained embedding.
model.wv.most_similar("школа")

In [None]:
# Nearest neighbours of "работа" (work).
model.wv.most_similar("работа")

In [None]:
# Raw embedding vector stored for "отпуск" (vacation).
model.wv.get_vector("отпуск")

In [None]:
# Nearest neighbours of "отпуск" (vacation).
model.wv.most_similar("отпуск")

In [None]:
# Analogy-style arithmetic: университет - студент + школьник
# (university - student + schoolchild), then look up the closest words.
# NOTE(review): the /3 only rescales the vector; presumably it does not affect
# the ranking if similar_by_vector is cosine-based - confirm.
vec = (model.wv['университет'] - model.wv['студент'] + model.wv['школьник'])/3
model.wv.similar_by_vector(vec)

In [None]:
# Same vector arithmetic for лето - жара + холод (summer - heat + cold).
vec = (model.wv['лето'] - model.wv['жара'] + model.wv['холод'])/3
model.wv.similar_by_vector(vec)

In [None]:
# Odd-one-out query over: night, street, lantern, pharmacy.
model.wv.doesnt_match("ночь улица фонарь аптека".split())

In [None]:
# Odd-one-out query over: flower, tree, cactus, food.
model.wv.doesnt_match("цветок дерево кактус еда".split())

In [None]:
# Odd-one-out query over: city, time, person, freebie.
model.wv.doesnt_match("город время человек халява".split())

In [None]:
from sklearn.manifold import TSNE

def display_closestwords_tsnescatterplot(model, word, topn=10):
    """Plot ``word`` and its nearest neighbours in a 2-D t-SNE projection.

    Args:
        model: Trained model; only ``model.wv`` (``most_similar`` and
            ``get_vector``) is used.
        word: Query word; it is plotted together with its neighbours.
        topn: Number of nearest neighbours to plot, at least 1. The default of
            10 reproduces the previous behaviour.

    Side effects:
        Prints the shape of the stacked vector matrix, switches NumPy printing
        to ``suppress=True`` and shows a matplotlib scatter plot.
    """
    # Query word first, then its neighbours, so labels and matrix rows stay aligned.
    close_words = model.wv.most_similar(word, topn=topn)
    word_labels = [word] + [neighbour for neighbour, _score in close_words]

    # Stack all vectors in one go: no hard-coded embedding width (it used to be
    # fixed at 100) and no repeated np.append copies.
    arr = np.array([model.wv.get_vector(label) for label in word_labels], dtype='f')

    # find tsne coords for 2 dimensions; perplexity is kept just below the
    # number of plotted points.
    tsne = TSNE(n_components=2, random_state=0, perplexity=arr.shape[0] - 1)
    np.set_printoptions(suppress=True)

    print(arr.shape)

    Y = tsne.fit_transform(arr)

    x_coords = Y[:, 0]
    y_coords = Y[:, 1]
    # display scatter plot
    plt.scatter(x_coords, y_coords)

    for label, x, y in zip(word_labels, x_coords, y_coords):
        plt.annotate(label, xy=(x, y), xytext=(0, 0), textcoords='offset points')
    # Pad outwards on both sides; the lower bound used to be shifted inwards
    # (min + eps), which put the left-most / bottom-most point outside the axes.
    plt.xlim(x_coords.min() - 0.00005, x_coords.max() + 0.00005)
    plt.ylim(y_coords.min() - 0.00005, y_coords.max() + 0.00005)
    plt.show()

# Plot the neighbourhood of one query word; swap in the commented line to try another.
# display_closestwords_tsnescatterplot(model, 'отпуск')
display_closestwords_tsnescatterplot(model, 'жара')

In [None]:
from nltk import FreqDist

# Count every token in the corpus, then keep the 100 most frequent words.
fd = FreqDist(token for doc in texts for token in doc)
top_words = [word for word, _count in fd.most_common(100)]
print(top_words)

In [None]:
# Embedding vectors for the most frequent words, one row per entry of top_words.
top_words_vec = model.wv[top_words]

In [None]:
# Sanity check: (number of words, embedding size).
top_words_vec.shape

In [None]:
def display_top_words_tsnescatterplot(model, top_words_vec, top_words):
    """Scatter-plot the given word vectors after reducing them to 2-D with t-SNE.

    Args:
        model: Unused; kept so existing calls keep working.
        top_words_vec: 2-D array with one embedding per row.
        top_words: Labels for the rows of ``top_words_vec``, in the same order.

    Side effects:
        Switches NumPy printing to ``suppress=True`` and shows one 20-inch-wide
        matplotlib figure.
    """
    # find tsne coords for 2 dimensions
    tsne = TSNE(n_components=2, random_state=0, perplexity=top_words_vec.shape[0] - 1)
    np.set_printoptions(suppress=True)

    Y = tsne.fit_transform(top_words_vec)

    x_coords = Y[:, 0]
    y_coords = Y[:, 1]

    # Create and widen the figure BEFORE drawing. Calling
    # plt.figure().set_figwidth(20) after plotting opened a second, empty
    # figure and left the scatter plot at the default width.
    fig, ax = plt.subplots()
    fig.set_figwidth(20)

    # display scatter plot
    ax.scatter(x_coords, y_coords)

    for label, x, y in zip(top_words, x_coords, y_coords):
        ax.annotate(label, xy=(x, y), xytext=(0, 0), textcoords='offset points')
    # Pad outwards on both sides (the lower bound used to be min + eps).
    ax.set_xlim(x_coords.min() - 0.00005, x_coords.max() + 0.00005)
    ax.set_ylim(y_coords.min() - 0.00005, y_coords.max() + 0.00005)

    plt.show()


# Plot the most frequent words in the t-SNE projection.
display_top_words_tsnescatterplot(model, top_words_vec, top_words)

In [None]:
from sklearn.metrics.pairwise import cosine_similarity

# Same lookup as in the earlier cell, repeated here so `top_words_vec` is
# defined before the distance computation.
top_words_vec = model.wv[top_words]

# Pairwise cosine distance between the word vectors (1 - cosine similarity).
dist = 1 - cosine_similarity(top_words_vec)

In [None]:
from scipy.cluster.hierarchy import  ward, dendrogram
# NOTE(review): scipy treats a 2-D array passed to ward() as observation
# vectors, not as a distance matrix, so the rows of `dist` are clustered as
# points here - confirm this is intended (a precomputed distance matrix would
# have to be passed in condensed form).
linkage_matrix = ward(dist)

fig, ax = plt.subplots(figsize=(10, 100))
# Draw on the Axes created above and keep `ax` bound to it; dendrogram()
# returns a dict of plot data, which used to overwrite `ax`.
dendro = dendrogram(linkage_matrix, orientation="bottom", labels=top_words, ax=ax)

# tick_params expects booleans; the string 'off' is truthy and does not switch
# anything off on current matplotlib.
ax.tick_params(
    axis='x',          # changes apply to the x-axis
    which='both',      # both major and minor ticks are affected
    bottom=False,      # ticks along the bottom edge are off
    top=False,         # ticks along the top edge are off
    labelbottom=False)

plt.tight_layout()

plt.savefig('w2v_cluster.png', dpi=200)  # save the dendrogram as w2v_cluster.png

In [None]:
from sklearn.manifold import TSNE
import matplotlib.pyplot as plt

# Project the whole vocabulary to 2-D: `labels` holds every word known to the
# model and `tokens` the matching vectors, row for row.
labels = list(model.wv.index_to_key)
tokens = model.wv[labels]

# random_state is pinned like the other t-SNE calls in this notebook so the
# layout is reproducible between runs.
tsne_model = TSNE(init='pca', learning_rate='auto', random_state=0)
new_values = tsne_model.fit_transform(tokens)


In [None]:
plt.figure(figsize=(7, 5))
bgn = 9000
count = 20
vals = new_values[bgn:bgn + count]
# One scatter call per word, each annotated with its label just beside the point.
for word, (x, y) in zip(labels[bgn:bgn + count], vals):
    plt.scatter(x, y)
    plt.annotate(
        word,
        xy=(x, y),
        xytext=(5, 2),
        textcoords='offset points',
        ha='right',
        va='bottom',
    )