In [63]:
import numpy as np
import pandas as pd
import random
import json
import sklearn
import gensim # using v.8.3.1
import nltk 
import logging
import utils
from sklearn.metrics import mean_squared_error as mse
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from gensim.models import KeyedVectors
from nltk import sent_tokenize, word_tokenize
from nltk.corpus import stopwords
from sklearn.feature_extraction.text import ENGLISH_STOP_WORDS as sklearn_stop_words
from sklearn.svm import SVR
from sklearn import linear_model

In [2]:
nltk.download('punkt')
nltk.download('stopwords')

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\alexi\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\alexi\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [3]:
nltk_stop_words = nltk.corpus.stopwords.words('english')
stop_words = list(set().union(nltk_stop_words, sklearn_stop_words))

In [5]:
with open(utils.DATA + 'tfidf_predicted_tags.json') as json_file:
    data = json.load(json_file)

In [6]:
corpus, target = [], []

for doc in data:
    if len(doc['normal_tags']) != 0:
        corpus.append(doc['captions'])
        target.append(doc['normal_tags'])

len(corpus)

3502

In [7]:
df = pd.DataFrame(zip(corpus, target), columns=['doc', 'tag'])

In [8]:
for x in df.columns:
    print(x)

doc
tag


In [9]:
train_data, test_data = train_test_split(df, test_size=0.1, random_state=42)

In [10]:
len(test_data)

351

In [11]:
%%time

wv = KeyedVectors.load_word2vec_format(utils.DATA + "w2v_googlenews/GoogleNews-vectors-negative300.bin.gz", binary=True)
wv.init_sims(replace=True)

Wall time: 2min 35s


First of all, we need to tokenize our documents and get a word2vec vector representation for each word. After that we're going to find the mean of all word vectors in a document so as to create one vector that is going to represent the text. And, finally, we will get the most similiar value from word2vec's vocabulary and see if this naive representation makes any sense.

In [12]:
def w2v_tokenize_tags(tag_list):
    sent = ""
    for tag in tag_list:
        sent = " ".join([sent, tag])
    tokens = w2v_tokenize_text(sent)
    return tokens

In [13]:
def w2v_tokenize_text(text):
    tokens = []
    for sent in sent_tokenize(text, language='english'):
        for word in word_tokenize(sent, language='english'):
            if len(word) < 2:
                continue
            if word in stop_words:
                continue
            tokens.append(word)
    return tokens

In [14]:
def w2v_transform_word(word):
    try:
        vector = wv.syn0norm[wv.vocab[word].index]
    except Exception as exception:
        # print(f"{type(exception).__name__} occured for word '{word}'.")
        vector = np.zeros(wv.vector_size,)
    return vector

In [15]:
def word_averaging(wv, words):
    all_words, mean = set(), []
    
    for word in words:
        if isinstance(word, np.ndarray):
            mean.append(word)
        elif word in wv.vocab:
            mean.append(wv.syn0norm[wv.vocab[word].index])
            all_words.add(wv.vocab[word].index)

    if not mean:
        logging.warning("cannot compute similarity with no input %s", words)
        return np.zeros(wv.vector_size,)

    mean = gensim.matutils.unitvec(np.array(mean).mean(axis=0)).astype(np.float32)
    return mean

def word_averaging_list(wv, doc_list):
    return np.vstack([word_averaging(wv, doc) for doc in doc_list])

In [16]:
test_tokenized = test_data.apply(lambda r: w2v_tokenize_text(r['doc']), axis=1).values
train_tokenized = train_data.apply(lambda r: w2v_tokenize_text(r['doc']), axis=1).values

In [17]:
y_test_tokenized = test_data.apply(lambda r: w2v_tokenize_tags(r['tag']), axis=1).values
y_train_tokenized = train_data.apply(lambda r: w2v_tokenize_tags(r['tag']), axis=1).values

In [18]:
%%time
X_train_word_average = word_averaging_list(wv, train_tokenized)
X_test_word_average = word_averaging_list(wv, test_tokenized)

  mean.append(wv.syn0norm[wv.vocab[word].index])


Wall time: 17.1 s


In [19]:
X_train_word_average.shape

(3151, 300)

In [20]:
y_train_word_average = word_averaging_list(wv, y_train_tokenized)
y_test_word_average = word_averaging_list(wv, y_test_tokenized)

  mean.append(wv.syn0norm[wv.vocab[word].index])


In [21]:
wv.most_similar(positive=[X_test_word_average[100]], restrict_vocab=100000, topn=10)

[('do', 0.6011865139007568),
 ('that', 0.582533597946167),
 ('actually', 0.5790467262268066),
 ('think', 0.5734384059906006),
 ('anyway', 0.5630047917366028),
 ('just', 0.5600370764732361),
 ('so', 0.5590255260467529),
 ('know', 0.551464319229126),
 ('but', 0.5503146052360535),
 ('guess', 0.5415164232254028)]

As we can see, the naive method isn't really helpful, since the text simlarity is close to words that are evidentally aren't important for understanding the topic. So now we're going to try something else.

## Tag prediction based on TF-IDF 

Here we're going to make an assumption that words in each text that were considered the most important ones by the TF-IDF algorithm represent our documents well enough. So we will convert top-1 words (according to TF-IDF) into w2v vectors and use them to predict one tag that we already have in our dataframe as a target value.

### Preparing data

In [22]:
df

Unnamed: 0,doc,tag
0,"When I was 15, I got my first professional aud...","[business, entertainment, social change, art, ..."
1,[SHAPE YOUR FUTURE] It’s a warm morning and I’...,"[storytelling, film, humanity, mental health, ..."
2,"""Should I do a cleanse?"" I hear people asking ...","[food, health, health care, biology, human body]"
3,[SHAPE YOUR FUTURE] We are at the beginning of...,"[culture, design, art, future, Moon, space, an..."
4,"Whitney Pennington Rodgers: Each of us, no mat...","[Social Change, Society, Humanity, Activism, M..."
...,...,...
3497,Good morning. How are you? (Audience) Good. It...,"[education, educational, system, creativity, i..."
3498,If you're here today -- and I'm very happy tha...,"[Sustainable, South, Bronx, environment, green..."
3499,With all the legitimate concerns about AIDS an...,"[Conference, preventive, medicine, diabetes, d..."
3500,This is really a two-hour presentation I give ...,"[How to succeed, secrets for success, what lea..."


In [23]:
tf_idf_data = []

for n in range(len(data)):
    if (len(data[n]['normal_tags']) != 0) & (len(data[n]['tf_idf_predicted_tags']) >= 5):
        row = {}
        for i in range(5):
            tokens = []
            line = data[n]['tf_idf_predicted_tags'][i][0]
            for word in word_tokenize(line, language='english'):
                line = line.replace(' ', '_')
            row[f'top_word_{i+1}'] = line
        line = data[n]['normal_tags'][0]
        row['target'] = line.replace(' ', '_')
        tf_idf_data.append(row)

print(f"Total length: {len(tf_idf_data)}")

Total length: 3496


In [24]:
tfidf_df = pd.DataFrame(tf_idf_data)

In [25]:
tfidf_df = tfidf_df[['top_word_1', 'target']]

In [26]:
tfidf_df.head()

Unnamed: 0,top_word_1,target
0,Latina,business
1,Adayanci,storytelling
2,liver,food
3,Moon,culture
4,lAJ,Social_Change


In [27]:
vectors = []

for col_name in tfidf_df.columns:
    col = tfidf_df.apply(lambda r: w2v_transform_word(r[col_name]), axis=1).values
    vectors.append(col)

df_temp = pd.DataFrame({'top-word': vectors[0], 'target': vectors[1]})

  vector = wv.syn0norm[wv.vocab[word].index]


In [28]:
df_temp.head()

Unnamed: 0,top-word,target
0,"[-0.05219498, -0.012211393, 0.08038586, 0.0415...","[0.0050679236, -0.023729809, -0.06200754, -0.0..."
1,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","[0.083809115, -0.054631125, -0.05711436, 0.040..."
2,"[-0.10085145, 0.06359244, -0.009875038, -0.020...","[-0.06801747, 0.061800815, -0.0621665, 0.13384..."
3,"[-0.040945165, 0.04616862, -0.031003747, -0.03...","[-0.0519085, 0.0729623, 0.05009352, 0.02504676..."
4,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ..."


In [29]:
data_flat = pd.DataFrame(df_temp['top-word'].to_list())
y_flat = pd.DataFrame(df_temp['target'].to_list())

In [30]:
y_short = y_flat[:][0]
data_flat['target'] = y_short

In [31]:
data_flat.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,291,292,293,294,295,296,297,298,299,target
0,-0.052195,-0.012211,0.080386,0.041589,0.033773,-0.041309,-0.020655,0.061406,0.045217,-0.020096,...,0.025679,-0.080386,0.000567,0.020096,0.082619,0.126161,0.025818,0.036844,0.020934,0.005068
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.083809
2,-0.100851,0.063592,-0.009875,-0.02031,-0.043702,0.023532,0.072837,-0.023392,0.03838,0.049585,...,-0.073397,-0.057149,-0.010575,-0.000994,0.056029,-0.067234,-0.029695,0.100291,0.060231,-0.068017
3,-0.040945,0.046169,-0.031004,-0.036227,0.052235,-0.038081,0.010868,0.013985,0.076161,-0.024769,...,0.048865,-0.034542,0.026117,0.026623,-0.000405,-0.00535,-0.076161,0.020136,-0.057964,-0.051909
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


### Trying to predict one of the coordinates of the target vector

In [32]:
train_df, test_df = train_test_split(data_flat, test_size=0.1, random_state=42)

y_train = train_df['target']
X_train = train_df.drop(columns=['target'])
y_test = test_df['target']
X_test = test_df.drop(columns=['target'])

In [33]:
%%time

svr = SVR(kernel='poly', degree=4, epsilon=0.05, C=3).fit(X_train, y_train)

y_pred_train = svr.predict(X_train)
y_pred_test = svr.predict(X_test)

print(f"MSE on train data:\t{mse(y_train, y_pred_train)}")
print(f"MSE on test data:\t{mse(y_test, y_pred_test)}")

MSE on train data:	0.0013281210158799823
MSE on test data:	0.0020305063491727754
Wall time: 1.08 s


In [34]:
%%time

linreg = linear_model.LinearRegression(n_jobs=1).fit(X_train, y_train)

y_pred_train = linreg.predict(X_train)
y_pred_test = linreg.predict(X_test)

print(f"MSE on train data:\t{mse(y_train, y_pred_train)}")
print(f"MSE on test data:\t{mse(y_test, y_pred_test)}")

MSE on train data:	0.001731760454196217
MSE on test data:	0.0021557951501767077
Wall time: 270 ms


In [35]:
%%time

rfr = RandomForestRegressor(n_estimators=500, max_depth=4, min_samples_split=5, min_samples_leaf=15, random_state=1)\
                            .fit(X_train, y_train)

y_pred_train = rfr.predict(X_train)
y_pred_test = rfr.predict(X_test)

print(f"MSE on train data:\t{mse(y_train, y_pred_train)}")
print(f"MSE on test data:\t{mse(y_test, y_pred_test)}")

MSE on train data:	0.0017709467750134233
MSE on test data:	0.0019875156822344825
Wall time: 1min 20s


In [36]:
params = {'n_estimators': 500,
          'max_depth': 4,
          'min_samples_split': 5,
          'learning_rate': 0.01,
          'loss': 'squared_error',
          'random_state': 1}

In [37]:
%%time

gbr = GradientBoostingRegressor(**params).fit(X_train, y_train)

y_pred_train = gbr.predict(X_train)
y_pred_test = gbr.predict(X_test)

print(f"MSE on train data:\t{mse(y_train, y_pred_train)}")
print(f"MSE on test data:\t{mse(y_test, y_pred_test)}")

MSE on train data:	0.0013930917730346358
MSE on test data:	0.001974105617912839
Wall time: 2min 10s


### Predicting tags using Support Vector Regression

In [38]:
def get_prediction(X_train, y_train, X_test, model):
    
    model.fit(X_train, y_train)
    y_predicted = model.predict(X_test)
    
    return y_predicted

In [59]:
%%time

prediction, mse_train, mse_test = [], [], []

for i in range(len(y_flat.columns)):
    data_flat[f'target_{i}'] = y_flat[:][i]


train_df, test_df = train_test_split(data_flat, test_size=0.1, random_state=42)


for i in range(len(y_flat.columns)):
    y_train = train_df[f'target_{i}']
    X_train = train_df.iloc[:, :300]
    y_test = test_df[f'target_{i}']
    X_test = test_df.iloc[:, :300]
    
    svr = SVR(kernel='poly', degree=4, epsilon=0.05, C=3)
    # linreg = linear_model.LinearRegression(n_jobs=1)
    # gbr = GradientBoostingRegressor(**params).fit(X_train, y_train)
    
    y_pred = get_prediction(X_train, y_train, X_test, svr)

    prediction.append(y_pred)
    mse_train.append(mse(y_train, y_pred_train))
    mse_test.append(mse(y_test, y_pred_test))
    
    if len(prediction) % 10 == 0:
        print(f"Predictions done: {len(prediction)}")


y_full = pd.DataFrame(zip(*prediction))

Predictions done: 10
Predictions done: 20
Predictions done: 30
Predictions done: 40
Predictions done: 50
Predictions done: 60
Predictions done: 70
Predictions done: 80
Predictions done: 90
Predictions done: 100
Predictions done: 110
Predictions done: 120
Predictions done: 130
Predictions done: 140
Predictions done: 150
Predictions done: 160
Predictions done: 170
Predictions done: 180
Predictions done: 190
Predictions done: 200
Predictions done: 210
Predictions done: 220
Predictions done: 230
Predictions done: 240
Predictions done: 250
Predictions done: 260
Predictions done: 270
Predictions done: 280
Predictions done: 290
Predictions done: 300
Wall time: 3min 42s


In [60]:
print(f"Mean MSE on train data:\t{np.mean(mse_train)}")
print(f"Mean MSE on test data:\t{np.mean(mse_test)}")

Mean MSE on train data:	0.0031595146821644044
Mean MSE on test data:	0.0031165329031722167


In [61]:
y_test_pred, y_test_true = [], []

for i in range(y_full.shape[0]):
    y_test_pred.append(y_full.iloc[i, :].to_list())
    y_test_true.append(test_df.iloc[i, :300].to_list())

In [78]:
pred_words, true_words = [], []

for i in range(300):
    prediction = wv.most_similar(positive=[np.array(y_test_pred[i], dtype='float32')], restrict_vocab=100000, topn=1)[0][0]
    pred_words.append(prediction)
    true = tfidf_df.iloc[test_df.index[i]][1]
    true_words.append(true)

tags_comparison = pd.DataFrame(zip(pred_words, true_words), columns=['y_predicted', 'y_true'])

In [79]:
tags_comparison[41:60]

Unnamed: 0,y_predicted,y_true
41,NASA,Astronomy
42,science,Microbes
43,cancer,DNA
44,Physics,visible
45,biology,Science
46,poetry,culture
47,viruses,biology
48,Death,Transgender
49,science,Leadership
50,science,Mental_Health


Possible further steps: train my own w2v model to make the vector space fit the corpus better; improve data processing and tokenization; predict tags based on several important words according to TF-IDF, not just one; use RNN.