# Импорт необходимых библиотек

In [1]:
import pandas as pd
import numpy as np
import json

from compress_fasttext.models import CompressedFastTextKeyedVectors

# Импорт данных

In [2]:
# Read the training table, using the first CSV column as the row index.
train_csv = "../Task 2/train.csv"
df = pd.read_csv(train_csv, index_col=0)
# Cast titles to str so the later `.split()` call works on every row.
df = df.assign(title_cleared=df['title_cleared'].astype(str))

In [3]:
# Preview the first rows of the loaded training data.
df.head()

Unnamed: 0,title,score_pushshift,id,url,num_comments,created_utc,selftext,author,is_self,subreddit,cleared_text,link_flair_text,num_crossposts,title_cleared,score,target,date,text_len,title_len
0,What do you Think?,1,bm70s3,https://www.reddit.com/r/relationships/comment...,12,1557331262,&amp;#x200B;\n\n&amp;#x200B;\n\n \n\n **TL;...,Dongustas,True,relationships,amp amp tl dr friends conversation relationshi...,[new],0,think,1,8,2019-05-08 19:01:02,328,1
1,"Me [26 M] with my fiancee [22 F], been togethe...",2,bm748u,https://www.reddit.com/r/relationships/comment...,5,1557331718,My fiancee and I have been together for almost...,orangeplum,True,relationships,fiancee together almost years wedding schedule...,[new],0,m fiancee f together years engaged sure want c...,2,8,2019-05-08 19:08:38,454,10
2,I [F25] am at my wits end with a good friend o...,1,bm74mc,https://www.reddit.com/r/relationships/comment...,1,1557331771,"Ok so for some background, my friend is an att...",saidhasdhxD,True,relationships,ok background friend attractive well educated ...,Non-Romantic,0,wits end good friend mine unrelentingly negati...,1,8,2019-05-08 19:09:31,315,12
3,I (26F) want to confess to him (28M) even thou...,3,bm75mj,https://www.reddit.com/r/relationships/comment...,8,1557331911,"Ok, not something new, but i'm a bit lost here...",kokoko38,True,relationships,ok something new i bit lost here so know guy l...,[new],0,want confess even though know feelings someone...,4,8,2019-05-08 19:11:51,94,8
4,I made a mistake and now Im losing my best fri...,0,bm78pe,https://www.reddit.com/r/relationships/comment...,9,1557332337,"I always try to do the right thing, but I don'...",Assistant_tothe,True,relationships,always try right thing know deal mistake i dat...,[new],0,made mistake im losing best friend,0,8,2019-05-08 19:18:57,136,6


Преобразование даты

In [4]:
# Parse the timestamp strings, then extract the calendar components with the
# vectorized `.dt` accessor instead of calling a Python lambda for every row.
df["date"] = pd.to_datetime(df['date'])
df["HOUR"] = df["date"].dt.hour        # hour of day, 0-23
df["MONTH"] = df["date"].dt.month      # month number, 1-12
df["WEEKDAY"] = df["date"].dt.weekday  # 0 = Monday ... 6 = Sunday

In [5]:
# Keep only the columns used for modelling. The explicit .copy() makes `df` an
# independent frame, so the column assignments further down do not raise
# SettingWithCopyWarning.
df = df[['title_cleared', 'score', 'cleared_text', 'num_comments', 'target', 'HOUR', 'MONTH', 'WEEKDAY']].copy()

In [6]:
# Preview the frame after the column selection and date-part extraction.
df.head()

Unnamed: 0,title_cleared,score,cleared_text,num_comments,target,HOUR,MONTH,WEEKDAY
0,think,1,amp amp tl dr friends conversation relationshi...,12,8,19,5,2
1,m fiancee f together years engaged sure want c...,2,fiancee together almost years wedding schedule...,5,8,19,5,2
2,wits end good friend mine unrelentingly negati...,1,ok background friend attractive well educated ...,1,8,19,5,2
3,want confess even though know feelings someone...,4,ok something new i bit lost here so know guy l...,8,8,19,5,2
4,made mistake im losing best friend,0,always try right thing know deal mistake i dat...,9,8,19,5,2


Кодирование временных характеристик как непрерывных переменных (sin/cos-преобразование)

In [7]:
def make_cos_list(list_, period=24):
    """Return the cosine component of a cyclic encoding for each value.

    Every value is mapped to ``cos(2 * pi * value / period)``, so values that
    are a whole ``period`` apart get the same encoding (up to floating-point
    rounding).

    Args:
        list_: Iterable of numeric values, e.g. hours of the day.
        period: Length of one full cycle, in the same units as the values.

    Returns:
        A list with one cosine value per input element, in input order.
    """
    encoded = []
    for value in list_:
        # Operation order matches value*2*pi/period to keep results bit-identical.
        encoded.append(np.cos(value * 2 * np.pi / period))
    return encoded

In [8]:
def make_sin_list(list_, period=24):
    """Return the sine component of a cyclic encoding for each value.

    Every value is mapped to ``sin(2 * pi * value / period)``, so values that
    are a whole ``period`` apart get the same encoding (up to floating-point
    rounding).

    Args:
        list_: Iterable of numeric values, e.g. hours of the day.
        period: Length of one full cycle, in the same units as the values.

    Returns:
        A list with one sine value per input element, in input order.
    """
    encoded = []
    for value in list_:
        # Operation order matches value*2*pi/period to keep results bit-identical.
        encoded.append(np.sin(value * 2 * np.pi / period))
    return encoded

In [9]:
# Each entry: (new column, encoder, source column, cycle length).
# The order below fixes the order in which the new columns are appended.
cyclic_features = [
    ('sin_hour', make_sin_list, 'HOUR', 24),
    ('cos_hour', make_cos_list, 'HOUR', 24),
    ('cos_month', make_cos_list, 'MONTH', 12),
    ('sin_month', make_sin_list, 'MONTH', 12),
    ('cos_weekday', make_cos_list, 'WEEKDAY', 7),
    ('sin_weekday', make_sin_list, 'WEEKDAY', 7),
]
for new_column, encoder, source_column, period in cyclic_features:
    df[new_column] = encoder(df[source_column], period)

# The raw integer components are no longer needed once they are encoded.
df.drop(columns=['HOUR', 'MONTH', 'WEEKDAY'], inplace=True)

Сжатие предобученной [модели](https://fasttext.cc/docs/en/crawl-vectors.html), обученной на Common Crawl и Википедии с использованием fastText

In [10]:
# One-off preparation step, kept commented out for reference: it saves the
# compressed model under the same file name that the next cell loads.
# NOTE(review): re-running it would presumably need `import compress_fasttext`
# and gensim's `load_facebook_model`; neither is imported in this notebook — confirm.
# big_model = load_facebook_model('cc.en.300.bin').wv
# small_model = compress_fasttext.prune_ft_freq(big_model, pq=True)
# small_model.save('compressed.cc.en.300.bin')

Загрузка сжатой модели

In [11]:
# Relative path of the compressed fastText vectors file.
embs = "compressed.cc.en.300.bin"
# Load the vectors; `embed` looks tokens up through `embeddings[token]`.
embeddings = CompressedFastTextKeyedVectors.load(embs)

Усреднение векторов по входящим в текст словам

In [12]:
def embed(tokens, default_size=None):
    """Average the word vectors of ``tokens`` into a single text embedding.

    Args:
        tokens: Sequence of string tokens looked up in the module-level
            ``embeddings`` model.
        default_size: Length of the zero vector returned for an empty token
            sequence. ``None`` (the default) uses ``embeddings.vector_size``
            so the fallback always has the same width as a real embedding.
            The previous hard-coded 100 did not match the 300 columns that
            each embedding is later expanded into.

    Returns:
        A 1-D numpy array: the element-wise mean of the token vectors, or a
        zero vector when ``tokens`` is empty.
    """
    if not tokens:
        if default_size is None:
            default_size = embeddings.vector_size
        return np.zeros(default_size)
    vectors = [embeddings[token] for token in tokens]
    # Plain sum / count keeps the numeric result identical to the original code.
    return sum(vectors) / len(tokens)

In [13]:
def process_record(record):
    """Split ``record`` on whitespace and return the embedding of its tokens.

    Args:
        record: A string of whitespace-separated tokens.

    Returns:
        Whatever ``embed`` returns for the resulting token list.
    """
    tokens = record.split()
    return embed(tokens)

Генерация признаков

In [14]:
# Embed every cleaned title: one averaged word vector per row.
title_vectors = df['title_cleared'].map(process_record)
df['emb_title'] = title_vectors

In [15]:
%%time
# Coerce each post body to str, then embed it (both steps done in a single pass).
df['emb_text'] = df['cleared_text'].map(lambda body: process_record(str(body)))

Wall time: 4min 3s


In [16]:
# The text columns are now represented by their embeddings, so remove them.
df = df.drop(columns=['title_cleared', 'cleared_text'])

In [17]:
# Preview the frame now that the text columns are replaced by embedding vectors.
df.head()

Unnamed: 0,score,num_comments,target,sin_hour,cos_hour,cos_month,sin_month,cos_weekday,sin_weekday,emb_title,emb_text
0,1,12,8,-0.965926,0.258819,-0.866025,0.5,-0.222521,0.974928,"[-0.026377800737436, 0.031197191717532966, 0.0...","[0.010187880619940673, 0.0020352929488744015, ..."
1,2,5,8,-0.965926,0.258819,-0.866025,0.5,-0.222521,0.974928,"[0.010478794348875827, -0.10610514487596119, 0...","[-0.0062868614158249295, -0.003861153425016649..."
2,1,1,8,-0.965926,0.258819,-0.866025,0.5,-0.222521,0.974928,"[-0.014719368115237016, 0.006043402803453759, ...","[-0.008690528819658067, -0.0011496505953478278..."
3,4,8,8,-0.965926,0.258819,-0.866025,0.5,-0.222521,0.974928,"[0.00485266917046247, 0.032715675408433384, 0....","[0.0075169208342334825, -0.0026545061944424227..."
4,0,9,8,-0.965926,0.258819,-0.866025,0.5,-0.222521,0.974928,"[-0.017449723841039714, -0.0983882193121, -0.0...","[0.0033536759881612464, -0.014314594422309982,..."


Форматирование данных

In [18]:
# Expand each embedding column (one vector per row) into a block of numeric
# columns named "<column>_<i>".
# - The vectors are converted straight to a float matrix. The former
#   json.dumps -> json.loads round trip produced the same floats but overwrote
#   the columns in `df` with JSON strings, so the cell failed on a second run.
# - The number of columns comes from the data instead of a hard-coded 300.
embedding_frames = {}
for column in ('emb_text', 'emb_title'):
    matrix = np.asarray(df[column].to_list(), dtype=float)
    embedding_frames[column] = pd.DataFrame(
        matrix, columns=[f"{column}_{i}" for i in range(matrix.shape[1])])

emb_text = embedding_frames['emb_text']
emb_title = embedding_frames['emb_title']

# Reset the index so the three frames line up row by row when concatenated.
df_embedded = pd.concat([df.drop(columns=['emb_title', 'emb_text']).reset_index(drop=True),
                         emb_text, emb_title], axis=1)

In [19]:
# Preview the final wide feature table.
df_embedded.head()

Unnamed: 0,score,num_comments,target,sin_hour,cos_hour,cos_month,sin_month,cos_weekday,sin_weekday,emb_text_0,...,emb_title_290,emb_title_291,emb_title_292,emb_title_293,emb_title_294,emb_title_295,emb_title_296,emb_title_297,emb_title_298,emb_title_299
0,1,12,8,-0.965926,0.258819,-0.866025,0.5,-0.222521,0.974928,0.010188,...,0.065674,-0.005625,-0.078449,0.006156,-0.005342,0.023279,-0.016699,0.035185,-0.006651,-0.001455
1,2,5,8,-0.965926,0.258819,-0.866025,0.5,-0.222521,0.974928,-0.006287,...,-0.001185,0.026326,-0.153998,0.032865,0.015729,0.03636,0.023284,0.079431,-0.063423,0.024095
2,1,1,8,-0.965926,0.258819,-0.866025,0.5,-0.222521,0.974928,-0.008691,...,0.039523,-0.00256,-0.047848,0.014236,-0.011898,0.00909,-0.014194,0.092682,-0.009166,0.007
3,4,8,8,-0.965926,0.258819,-0.866025,0.5,-0.222521,0.974928,0.007517,...,0.040959,0.024521,-0.070278,0.005593,-0.000135,0.031898,0.027061,0.065442,-0.022263,-0.006768
4,0,9,8,-0.965926,0.258819,-0.866025,0.5,-0.222521,0.974928,0.003354,...,0.056048,-0.066131,-0.126724,0.002335,0.009897,0.033748,0.040027,0.051594,-0.029816,-0.057611


# Экспорт данных

In [20]:
# Write the final feature table to CSV; the row index is saved as the first column.
output_csv = "final_embedded.csv"
df_embedded.to_csv(output_csv, index=True)