In [None]:
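# The approach is inspired by this post (Doc2Vec features + logistic regression):
# https://www.kaggle.com/code/wpncrh/doc2vec-and-logistic-regression/notebook
# Note: this notebook fits a LinearRegression per skill instead of a logistic regression.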

In [21]:
!pip install gensim



In [22]:
import ast
import csv

import pandas as pd
from gensim.models.doc2vec import Doc2Vec, TaggedDocument
from sklearn.linear_model import LinearRegression
from sklearn.preprocessing import MultiLabelBinarizer

In [25]:
# Load every input the notebook needs: the pre-computed lemmas, the train/test
# id tables, the skill catalogue and the test texts.

# Single place to change when the project is checked out somewhere else.
PROJECT_ROOT = '/home/martin/ML/ML-2022-T28-P05'

# text_id -> list of lemmas.
# The file is read with csv.reader instead of pd.read_csv; the latter was tried
# first and reportedly produced truncated lemma lists.
text_lemmas = dict()
with open(f'{PROJECT_ROOT}/preprocessed_text/lemmas.csv', mode='r',
          encoding='utf-8', newline='') as file:
    lemmas_file = csv.reader(file)
    next(lemmas_file, None)  # skip the header row
    for line in lemmas_file:
        if not line:
            continue  # tolerate blank lines in the CSV
        # Column 1 holds the textual repr of a Python list. literal_eval parses
        # literals only, so file content can never execute code (unlike eval).
        text_lemmas[line[0]] = ast.literal_eval(line[1])

train_ids = pd.read_csv(f'{PROJECT_ROOT}/original_data/train.csv')
test_ids = pd.read_csv(f'{PROJECT_ROOT}/original_data/test.csv')
skills_df = pd.read_csv(f'{PROJECT_ROOT}/original_data/skills.csv')
skill_list = skills_df['skill_id'].tolist()
test_all = pd.read_csv(f'{PROJECT_ROOT}/data/test_texts_with_ids.csv', sep="|", encoding="UTF-8")
# Contents of the "text" column of the test file. NOTE(review): the name
# test_X is re-bound to the inferred Doc2Vec vectors in a later cell.
test_X = test_all["text"].tolist()

In [26]:
# Pair every training text's lemmas (lowercased) with its list of skill ids.
# Each row of train.csv is read positionally: column 0 is the text id,
# column 1 the space-separated skill ids.
trainlist = train_ids.values.tolist()
lemmas_with_skills = [
    [[lemma.lower() for lemma in text_lemmas[record[0]]], record[1].split(" ")]
    for record in trainlist
]

# Two-column training frame: 'lemmas' (list of str) and 'skill' (list of str)
train = pd.DataFrame(lemmas_with_skills, columns=['lemmas', 'skill'])

In [27]:
# Preview the first rows of the training frame (lemma list and skill list per text)
train.head()

Unnamed: 0,lemmas,skill
0,"[kontrollima, töö, ja, mõõteriist, ning, isiku...",[s6]
1,[mõõtmine],[s19]
2,"[meeskonnaliige, arendama, oma, teadmine, järg...","[s15, s20, s42, s48]"
3,"[samas, ei, kaduma, ikt-oskus, kõrval, vajadus...","[s13, s9]"
4,"[kasutama, oma, igapäevatöö, arvuti, infotöötl...",[s0]


In [28]:
# Joining test set text ids and lemmas to one list.
# The lemmas are lowercased exactly like the training lemmas, so that the
# vectors inferred for train and test come from identically normalised tokens.
lemmas_with_skills = [
    [text_id, [lemma.lower() for lemma in text_lemmas[text_id]]]
    for text_id in test_ids['text_id'].tolist()
]

# Making that a dataframe: one row per test text, in the order of test_ids
test = pd.DataFrame(lemmas_with_skills, columns=['text_id', 'lemmas'])

In [29]:
# Preview the first rows of the test frame (text id and lemma list per text)
test.head()

Unnamed: 0,text_id,lemmas
0,text0,"[võistlusülesanne, lahendamine, vajalik, erial..."
1,text106,"[tegemine, olema, inimene, ,, kes, olema, avat..."
2,text108,"[72%, juht, tooma, kõige, tähtsam, tegur, välj..."
3,text109,"[sagedane, muutus, tööelu, tähendama, ka, valm..."
4,text112,"[palju, olema, jutt, see, ,, et, eestlane, ole..."


In [30]:
# Doc2Vec training corpus: one TaggedDocument per training text, tagged with
# the text's skill ids. The lemmas are lowercased so the model is trained on
# the same token forms as the lowercased 'lemmas' column of the `train` frame,
# which is what the vectors are later inferred from.
documents = [
    TaggedDocument(
        words=[lemma.lower() for lemma in text_lemmas[row['text_id']]],
        tags=row['skills'].split(" "),
    )
    for _, row in train_ids.iterrows()
]

In [31]:
# Build the vocabulary from the tagged documents, then train the Doc2Vec model.
# NOTE(review): with workers=4 training is multi-threaded, so the learned
# vectors are presumably not bit-for-bit reproducible between runs - confirm
# against the gensim documentation if exact reproducibility matters.
doc2vec_model = Doc2Vec(
    vector_size=100,
    window=8,
    min_count=1,
    workers=4,
    epochs=15,
)
doc2vec_model.build_vocab(documents)
doc2vec_model.train(
    documents,
    # Both values are read back from the model instead of being repeated:
    # corpus_count is the number of documents seen by build_vocab and
    # epochs is the value passed to the constructor above.
    total_examples=doc2vec_model.corpus_count,
    epochs=doc2vec_model.epochs,
)

In [32]:
# Feature vectors for the test set: one inferred Doc2Vec vector per test text
test_X = [doc2vec_model.infer_vector(lemmas) for lemmas in test['lemmas'].tolist()]

# Turn the 'skill' column (a list of skill ids per text) into one 0/1 indicator
# column per skill id. `train` itself is left untouched so this cell can be
# re-run without first rebuilding the frame.
mlb = MultiLabelBinarizer()
skill_indicators = pd.DataFrame(
    mlb.fit_transform(train['skill']),
    columns=mlb.classes_,
    index=train.index,
)
# Same layout as before: the 'lemmas' column followed by the indicator columns
text_skills_df = train.drop(columns='skill').join(skill_indicators)

# Feature vectors for the training set, inferred with the same model
train_X = [doc2vec_model.infer_vector(lemmas) for lemmas in train['lemmas'].tolist()]

In [33]:
# Fit one independent linear regression per skill on the 0/1 indicator column
# of that skill and store its raw (continuous) predictions for the test set.
# test_preds maps skill id -> array of scores, one score per test text.
#
# NOTE(review): no class re-weighting is applied. LinearRegression has no
# class_weight option; if the label imbalance should be countered, per-sample
# weights have to be passed through fit(..., sample_weight=...).
test_preds = dict()

for skill in skill_list:
    train_y = text_skills_df[skill]

    lin = LinearRegression()
    lin.fit(train_X, train_y)

    test_preds[skill] = lin.predict(test_X)

In [34]:
# One column per skill id, one row per test text, holding the regression scores
skills_preds = pd.DataFrame(test_preds)

In [35]:
# For every test text (row), keep the skill ids whose score is at least 0.5
df = skills_preds.apply(
    lambda scores: scores.index[scores >= 0.5].tolist(),
    axis=1,
)

In [36]:
# Build the submission rows: a header followed by one [text_id, "s1 s2 ..."]
# row per test text. Texts for which no skill reached the threshold get the
# fallback label 's0' so that no row has an empty skills field.
text_ids = test_ids['text_id'].tolist()  # built once, not once per row
predictions = df.tolist()
assert len(text_ids) == len(predictions), "test ids and predictions are misaligned"

submission = [['text_id', 'skills']]
for text_id, labels in zip(text_ids, predictions):
    # `labels or [...]` substitutes the fallback without mutating the
    # prediction lists that are also stored inside `df`.
    submission.append([text_id, ' '.join(labels or ['s0'])])

In [37]:
# Write the header and prediction rows to the submission file as
# comma-separated values. newline='' lets the csv module control line endings.
with open('/home/martin/ML/ML-2022-T28-P05/submissions/Doc2Vec_LinearRegression_s0.csv', 'w', newline='') as out_file:
    csv.writer(out_file, delimiter=',').writerows(submission)