In [1]:
import os
import pandas as pd 
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from matplotlib import style
# Plot styling: apply seaborn's 'white' style first, then matplotlib's
# 'fivethirtyeight' style sheet.
sns.set(style='white')
style.use("fivethirtyeight")

In [2]:
# Load the training-set tables: the click log and the ad attribute table.
df_click = pd.read_csv('train_preliminary/click_log.csv')
df_ad = pd.read_csv('train_preliminary/ad.csv')
# df_user = pd.read_csv('train_preliminary/user.csv')
# Print (rows, columns) of each table as a quick sanity check.
print(df_click.shape)
print(df_ad.shape)

(30082771, 4)
(2481135, 6)


In [3]:
# Load the test-set tables: the click log and the ad attribute table.
test_clicks = pd.read_csv('test/click_log.csv')
test_ads = pd.read_csv('test/ad.csv')
# Print (rows, columns) of each table as a quick sanity check.
print(test_clicks.shape)
print(test_ads.shape)

(33585512, 4)
(2618159, 6)


In [14]:
# df_ad.loc[df_ad['product_id'] == '\\N', 'product_id'] = 0
# df_ad.loc[df_ad['industry'] == '\\N', 'industry'] = 0
# test_ads.loc[test_ads['industry'] == '\\N', 'industry'] = 0
# test_ads.loc[test_ads['product_id'] == '\\N', 'product_id'] = 0

In [4]:
# Left-join the ad attributes onto every click row via creative_id,
# separately for the training and the test click logs.
train_data = pd.merge(df_click,df_ad,on='creative_id',how='left')
test_data = pd.merge(test_clicks,test_ads,on='creative_id',how='left')
print(train_data.shape,test_data.shape)

(30082771, 9) (33585512, 9)


In [5]:
# Stack the joined training and test clicks into one frame so that the
# per-user features and embeddings are computed over all users at once.
merged_train_test = pd.concat([train_data,test_data])
# A bare expression is only rendered when it is the last line of a cell, so
# the shape has to be printed explicitly to appear in the output.
print(merged_train_test.shape)
merged_train_test.head(2)

Unnamed: 0,time,user_id,creative_id,click_times,ad_id,product_id,product_category,advertiser_id,industry
0,9,30920,567330,1,504423,30673,3,32638,319
1,65,30920,3072255,1,2642300,1261,2,6783,6


In [6]:
# Per-user aggregate statistics.
def aggregate_features(df_):
    """Compute per-user aggregate statistics from a click-level frame.

    Parameters
    ----------
    df_ : pandas.DataFrame
        Click-level rows containing ``user_id`` plus every column named in
        the aggregation spec below. The frame is not modified.

    Returns
    -------
    pandas.DataFrame
        One row per ``user_id``. Columns are ``user_id`` followed by
        ``<column>_<statistic>`` for every requested statistic
        (e.g. ``click_times_mean``, ``ad_id_nunique``).
    """
    agg_func = {
        'creative_id' : ['nunique'],
        'click_times' : ['max', 'min', 'mean', 'std', 'count'],
        'ad_id' : ['nunique'],
        'product_id' : ['nunique'],
        'product_category' : ['nunique'],
        'advertiser_id' : ['nunique'],
        'industry' : ['nunique'],
        'time':['nunique']
    }

    # groupby/agg only reads its input, so the frame is used directly rather
    # than being deep-copied first (the copy doubled peak memory for nothing).
    agg_df = df_.groupby('user_id').agg(agg_func).reset_index()
    # Flatten the two-level column index: ('click_times', 'max') becomes
    # 'click_times_max'; ('user_id', '') becomes 'user_id' once the trailing
    # underscore is stripped.
    agg_df.columns =  ['_'.join(i).strip('_') for i in agg_df.columns.values]

    return agg_df

In [7]:
# Per-user mode (most frequent value) features.
def _first_mode(values):
    """Return the smallest most-frequent value of a Series, or NaN if none."""
    modes = values.mode()  # sorted ascending; NaN entries are ignored
    return modes.iloc[0] if len(modes) else np.nan


def mode_features(df_, train_features):
    """Append each user's most frequent ID per ad attribute to a feature table.

    Parameters
    ----------
    df_ : pandas.DataFrame
        Click-level rows with ``user_id`` and the six attribute columns
        listed in ``mode_columns`` below.
    train_features : pandas.DataFrame
        Per-user table with a ``user_id`` column; it is not modified.

    Returns
    -------
    pandas.DataFrame
        ``train_features`` left-joined with one ``<column>_mode`` column per
        attribute. Ties are resolved towards the smallest value. Users
        missing from ``df_`` get NaN. The first column is spelled
        ``creative_id_mode`` (it used to be emitted as ``creatice_id_mode``).
    """
    mode_columns = (
        'creative_id',
        'ad_id',
        'product_id',
        'product_category',
        'advertiser_id',
        'industry',
    )
    # pandas' own Series.mode is used so the function needs no extra import
    # and also handles string-valued columns.
    grouped = df_.groupby('user_id')
    for column in mode_columns:
        mode_df = grouped[column].agg(_first_mode).reset_index()
        mode_df.columns = ['user_id', column + '_mode']
        train_features = pd.merge(train_features, mode_df, on='user_id', how='left')

    return train_features

In [8]:
# Per-user aggregate statistics over the combined train + test click log.
aggregate_features_df = aggregate_features(merged_train_test)

In [11]:
# Display the per-user aggregate feature table.
aggregate_features_df

Unnamed: 0,user_id,creative_id_nunique,click_times_max,click_times_min,click_times_mean,click_times_std,click_times_count,ad_id_nunique,product_id_nunique,product_category_nunique,advertiser_id_nunique,industry_nunique,time_nunique
0,1,12,2,1,1.076923,0.277350,13,12,6,3,12,9,10
1,2,42,2,1,1.022222,0.149071,45,42,20,3,36,15,28
2,3,30,1,1,1.000000,0.000000,30,30,17,6,28,8,23
3,4,29,1,1,1.000000,0.000000,29,29,18,6,26,10,15
4,5,33,2,1,1.030303,0.174078,33,33,7,4,30,18,26
5,6,16,1,1,1.000000,0.000000,17,16,6,2,13,10,14
6,7,10,1,1,1.000000,0.000000,10,10,5,4,10,8,10
7,8,48,2,1,1.061224,0.242226,49,48,25,6,37,20,34
8,9,17,2,1,1.050000,0.223607,20,17,12,2,15,5,17
9,10,9,1,1,1.000000,0.000000,10,9,5,3,8,5,10


In [12]:
# Persist the aggregate features to CSV, omitting the row index.
aggregate_features_df.to_csv('aggregate_features_df.csv', index=False)

In [13]:
# !pip install joblib 
from joblib import Parallel,delayed
import multiprocessing
import pickle

def get_sequences_corpus(tt_df,keyword):
    """Build one time-ordered token sequence per user and pickle the result.

    Rows are sorted by ``time`` and grouped by ``user_id``; each user's
    ``keyword`` values are converted to strings. The sequences are stored, in
    ascending ``user_id`` order, as a list of lists in
    ``<keyword>_corpus.pkl`` in the working directory. Progress information
    is printed and nothing is returned.
    """
    print(tt_df.shape)
    ordered = tt_df[['time','user_id',keyword]].sort_values('time')

    # One numpy array of `keyword` values per user, in time order.
    per_user_values = ordered.groupby('user_id')[keyword].apply(lambda x :x.values)
    print('finished Groupby')
    sorted_uids = np.sort(ordered['user_id'].unique())
    print(len(sorted_uids))
    print(sorted_uids[:10])
    sequences = [
        per_user_values.get(uid).astype('str').tolist()
        for uid in sorted_uids
    ]

    with open(keyword + '_corpus.pkl','wb') as out_file:
        pickle.dump(sequences,out_file)
    print('finish save corpus')
    print('the length of corpus:', len(sequences))
    

def load_corpus(keyword):
    """Read ``<keyword>_corpus.pkl`` from the working directory.

    Prints the number of sequences in the unpickled corpus and returns it.
    """
    corpus_path = keyword+'_corpus.pkl'
    # pickle.load can execute arbitrary code; only use it on files that this
    # notebook wrote itself.
    with open(corpus_path,'rb') as in_file:
        sequences = pickle.load(in_file)
    print(len(sequences))
    return sequences

In [14]:
# !pip install gensim
from gensim.models.word2vec import Word2Vec
import multiprocessing
def train_word_embedding(corpus,_size = 128, _min_count = 1,_window = 175, key_word = 'creative_id',train = 1):
    """Train (or reload) a Word2Vec model and return its vectors as a frame.

    Parameters
    ----------
    corpus : list of list of str
        Token sequences handed to ``Word2Vec``; unused when ``train != 1``.
    _size, _min_count, _window : int
        Forwarded to ``Word2Vec`` as ``size``, ``min_count`` and ``window``.
    key_word : str
        Names the model file ``<key_word>_embedding.model`` and the token
        column of the returned frame.
    train : int
        ``1`` trains a new model (``sg=1``, 10 iterations, one worker per
        CPU) and saves it; any other value loads the saved model instead.

    Returns
    -------
    pandas.DataFrame
        One row per vocabulary token: column ``key_word`` holds the token as
        ``str``, followed by integer-named columns ``0 .. dim-1`` holding the
        vector components as float64.
    """
    # NOTE(review): `size`, `iter` and `wv.vocab` are the gensim 3.x
    # spellings; confirm the pinned gensim version before upgrading.
    print(_size,_window,_min_count)
    if train == 1:
        model = Word2Vec(corpus,size= _size,min_count = _min_count,sg = 1, window=_window,workers = multiprocessing.cpu_count(), iter=10)
        model.save(key_word + '_embedding.model')
    else:
        model = Word2Vec.load(key_word + '_embedding.model')

    words = list(model.wv.vocab.keys())
    print(key_word + '中有' + str(len(words))+ '个点')

    # Stack the vectors straight into an (n_words, dim) matrix. Going through
    # a dict of Python lists and a frame with one column per token, then
    # transposing, is far slower and hungrier for vocabularies of millions.
    vectors = np.array([model.wv[word] for word in words], dtype=np.float64)
    we_df = pd.DataFrame(vectors)
    we_df.insert(0, key_word, [str(word) for word in words])
    return we_df


In [15]:
# Build the per-user product_id sequences and pickle them
# (get_sequences_corpus writes 'product_id_corpus.pkl').
sequence = merged_train_test[['time','user_id','product_id']]
get_sequences_corpus(sequence,'product_id')

(63668283, 3)
finished Groupby
1900000
[ 1  2  3  4  5  6  7  8  9 10]
finish save corpus
the length of corpus: 1900000


In [16]:
# Train (train=1) or load the product_id embedding.
pid_corpus = load_corpus('product_id')
pid_word_embedding = train_word_embedding(corpus=pid_corpus,_size =100, _min_count = 1,_window = 150, key_word = 'product_id',train = 1)
pid_word_embedding.head(5)
# NOTE: the figure 3027360 that used to be noted here is the ad_id vocabulary
# size, copied over from the ad_id cell; this product_id run reports 39057.

1900000
100 150 1
product_id中有39057个点


Unnamed: 0,product_id,0,1,2,3,4,5,6,7,8,...,90,91,92,93,94,95,96,97,98,99
0,\N,0.230449,-0.016309,-0.032462,0.246716,0.065027,-0.158462,0.063976,0.135997,-0.06436,...,0.0441,0.108972,-0.212846,-0.354464,-0.309857,-0.0473,-0.191737,0.125584,-0.217134,0.037222
1,136,0.662488,-0.151091,-0.174232,-0.040902,-0.016266,-0.107945,0.093013,0.111007,-0.135488,...,-0.008549,-0.026086,-0.190198,-0.24561,-0.14442,-0.096185,-0.021873,0.136047,-0.041248,-0.414705
2,1334,0.231772,0.12311,-0.058502,-0.090107,0.02126,-0.096627,0.074578,0.310335,0.077614,...,-0.161495,0.059424,-0.511773,-0.34293,-0.159982,0.077776,0.294551,0.012943,0.016179,0.035727
3,1454,0.103512,0.098439,-0.08849,-0.077993,0.091549,-0.188213,-0.098858,0.077935,0.014191,...,-0.001395,-0.158864,-0.567167,-0.10483,-0.326551,0.191296,0.241825,-0.005503,-0.045827,0.214234
4,64,0.388417,-0.20347,0.104972,-0.161905,0.164543,-0.419927,-0.007025,-0.154285,0.084066,...,0.603175,0.033105,-0.445034,0.126247,-0.315541,-0.002137,0.080241,0.174483,-0.401077,-0.171392


In [25]:
# Train (train=1) or load the ad_id embedding.
# NOTE(review): no visible cell writes 'ad_id_corpus.pkl'; presumably it was
# produced by an earlier get_sequences_corpus(..., 'ad_id') run - confirm.
ua_corpus = load_corpus('ad_id')
ua_word_embedding = train_word_embedding(corpus=ua_corpus,_size =100, _min_count = 1,_window = 150, key_word = 'ad_id',train = 1)
ua_word_embedding.head(5)
# Vocabulary size reported by this run: 3027360

1900000
100 150 1
ad_id中有3027360个点


Unnamed: 0,ad_id,0,1,2,3,4,5,6,7,8,...,90,91,92,93,94,95,96,97,98,99
0,724607,-0.385773,-0.589155,-0.510014,0.20339,-0.294017,-0.749488,0.235264,-0.016776,0.773691,...,0.024171,-0.488681,-0.561043,-0.281232,-0.331157,-0.042329,-0.574566,-0.002759,-0.151273,-0.22946
1,188507,-0.680983,0.016435,-0.288762,0.291477,-0.127756,0.054286,0.238733,0.251901,-0.26984,...,-0.013893,-0.09023,0.057228,0.414773,-0.286655,0.398024,-0.163143,0.085627,-0.245028,0.399153
2,773445,-0.325775,-0.032281,0.008332,-0.193718,0.186926,-0.094122,-0.608319,-0.227784,-0.04502,...,0.197333,-0.240588,0.339618,0.275166,-0.354037,0.306582,-0.331759,0.319663,-0.137082,-0.334939
3,1458878,-0.415269,-0.388307,-0.62262,-0.147038,-0.510759,0.042591,0.280436,-0.079508,0.249794,...,0.03775,-0.10849,0.413869,-0.226727,-0.057482,0.503483,-0.048068,0.295531,-0.225183,0.329344
4,109959,-0.002504,0.038699,-0.072784,0.205399,0.407421,0.050485,-0.033955,-0.11647,0.15491,...,-0.453947,-0.151128,0.372142,0.021832,-0.286297,0.071975,0.122007,0.137176,0.128858,-0.1758


In [5]:
# Train (train=1) or load the advertiser_id embedding.
# NOTE(review): no visible cell writes 'advertiser_id_corpus.pkl' - confirm
# where it is generated. The recorded output shows window 15 and five rows,
# so it predates the current arguments; re-run to refresh it.
uad_corpus = load_corpus('advertiser_id')
uad_word_embedding = train_word_embedding(corpus=uad_corpus,_size = 100, _min_count = 1,_window = 150, key_word = 'advertiser_id',train=1)
uad_word_embedding.head(1)

1900000
100 15 1
advertiser_id中有57870个点


Unnamed: 0,advertiser_id,0,1,2,3,4,5,6,7,8,...,90,91,92,93,94,95,96,97,98,99
0,7293,-0.104691,0.015903,-0.184233,0.183754,0.34444,-0.493895,0.18894,-0.326659,0.052126,...,-0.086219,0.050894,0.084754,-0.056196,-0.293527,-0.391693,-0.194405,0.220754,0.040084,-0.371816
1,9702,-0.331662,-0.278682,0.375037,0.166552,0.172987,0.147581,-0.163447,-0.1767,0.192965,...,-0.017621,0.251893,-0.131092,0.305089,0.094843,-0.524838,-0.155095,0.079298,0.305831,-0.278831
2,29455,-0.190413,0.003109,-0.318974,0.046734,0.041127,-0.009788,-0.107323,-0.306027,0.078365,...,-0.016108,-0.018114,-0.007624,-0.024844,0.113709,-0.103425,-0.355613,0.475975,-0.24692,-0.378134
3,14668,-0.390839,0.07044,-0.355348,0.199962,0.05483,0.003663,-0.035301,-0.230824,0.032787,...,-0.245805,0.218285,0.093591,-0.272302,0.063889,-0.076427,-0.19851,0.120277,0.181492,0.00651
4,11411,0.063317,-0.278979,-0.013099,-0.122002,-0.092589,0.022956,-0.266509,0.151448,-0.361329,...,0.220851,-0.037119,0.002807,-0.076418,0.008427,-0.069167,0.052495,0.437861,0.178336,-0.011764


In [20]:
# Train (train=1) or load the creative_id embedding.
# NOTE(review): no visible cell writes 'creative_id_corpus.pkl' - confirm
# where it is generated.
uc_corpus = load_corpus('creative_id')
uc_word_embedding = train_word_embedding(uc_corpus,_size = 100, _min_count = 1,_window = 150, key_word = 'creative_id',train=1)
uc_word_embedding.head(1)

1900000
100 150 1
creative_id中有3412772个点


Unnamed: 0,creative_id,0,1,2,3,4,5,6,7,8,...,90,91,92,93,94,95,96,97,98,99
0,821396,-0.269684,-0.656224,0.21763,-0.060496,0.056127,0.133512,-0.07431,-0.919534,-0.212106,...,-1.137128,-0.099394,-0.299962,-0.064014,-0.168696,0.82354,0.539109,-0.076107,-0.092199,0.53629


In [9]:
# Train (train=1) or load the product_category embedding (side information).
# This cell uses a window of 15, unlike the 150 used for the ID embeddings.
# NOTE(review): no visible cell writes 'product_category_corpus.pkl' -
# confirm where it is generated.
up_corpus = load_corpus('product_category')
up_word_embedding = train_word_embedding(corpus = up_corpus,_size = 100, _min_count = 1,_window = 15, key_word = 'product_category',train = 1)
up_word_embedding.head(5)

1900000
100 15 1
product_category中有18个点


Unnamed: 0,product_category,0,1,2,3,4,5,6,7,8,...,90,91,92,93,94,95,96,97,98,99
0,5,-0.103636,0.148743,-0.422304,-0.485209,0.052336,0.086262,0.17266,-0.026908,-0.156475,...,-0.074139,0.125896,-0.049802,0.270399,0.43483,-0.301719,-0.660519,0.405056,0.003249,-0.016504
1,2,-0.128084,-0.079798,-0.357088,-0.679929,0.052974,-0.159124,-0.229887,0.120345,-0.329958,...,0.038606,-0.125116,-0.13507,0.192971,0.063832,-0.623519,-0.617402,-0.142033,0.490683,0.12626
2,18,-0.047594,0.033393,-0.214233,-0.740539,-0.08244,-0.214437,-0.276153,0.085503,-0.216667,...,-0.045498,-0.040119,-0.019082,0.400073,0.393043,-0.560828,-0.89048,0.366927,0.269973,-0.066372
3,8,0.299942,-0.029538,0.084795,-0.878347,-0.12125,-0.609981,-0.534536,0.103427,-0.557676,...,-0.14759,0.106715,0.212347,0.556049,0.973045,-0.629,-1.097283,0.635314,0.184735,-0.14609
4,17,-0.139701,0.054033,-0.463149,-0.570159,-0.03701,0.053094,0.026815,0.112129,-0.27891,...,-0.021069,-0.005278,-0.133863,0.189418,0.042958,-0.554619,-0.772551,1.053649,-0.180694,-0.16623


In [21]:
# Train (train=1) or load the industry embedding (side information), again
# with a window of 15.
# NOTE(review): no visible cell writes 'industry_corpus.pkl' - confirm where
# it is generated.
in_corpus = load_corpus('industry')
# Show the second user's sequence as a sanity check of the corpus format.
print(in_corpus[1])
in_word_embedding = train_word_embedding(corpus = in_corpus,_size = 100, _min_count = 1,_window = 15, key_word = 'industry',train = 1)
in_word_embedding.head(5)

1900000
['318', '238', '6', '6', '47', '318', '317', '47', '6', '47', '242', '6', '6', '47', '47', '47', '6', '47', '6', '238', '5', '36', '319', '242', '6', '238', '36', '322', '6', '289', '36', '242', '21', '207', '250', '317', '238', '317', '317', '242', '317', '319', '242', '207', '328']
100 15 1
industry中有332个点


Unnamed: 0,industry,0,1,2,3,4,5,6,7,8,...,90,91,92,93,94,95,96,97,98,99
0,326,1.136113,2.631273,-1.603814,-1.033288,-0.478008,-1.222062,2.049399,7.960654,5.249063,...,4.310451,1.098739,0.415853,4.753929,-1.199111,1.439978,-1.444424,-4.265588,1.862052,-3.249988
1,6,0.377483,0.103574,-1.103577,-0.634888,0.053978,0.263496,-0.401817,2.186285,0.744974,...,0.862858,0.504502,1.242198,0.238195,2.445097,-0.843281,-3.039742,-0.723826,2.531846,-1.049928
2,106,-2.919505,-0.979881,-0.047153,-1.899598,-0.411099,0.230444,0.922951,-0.35047,-2.232259,...,2.484988,3.775793,-1.268427,0.148782,-0.652177,-1.347748,3.44487,-0.198419,3.787997,0.272746
3,\N,4.308415,3.860561,0.93065,1.927668,1.43258,2.845945,1.367319,1.76544,2.832134,...,-2.648932,-3.470037,-2.534724,2.98012,1.28905,2.939425,-0.618596,3.449397,-1.24391,-0.144597
4,73,-2.741806,-1.668029,-0.627605,0.353121,-0.982013,2.690246,-3.774492,4.587998,0.591344,...,3.424317,1.916111,-1.410281,2.573311,2.236505,2.972033,-0.278666,0.281115,1.732472,1.202738


In [35]:
# Average the embeddings of all tokens a user interacted with (one user maps
# to many embeddings); returns one averaged vector per user.
def average_embedding(df, embeddings, keyword = 'creative_id' ):
    """Average, per user, the embedding vectors of the tokens they clicked.

    Parameters
    ----------
    df : pandas.DataFrame
        Click-level rows with ``user_id`` and the ``keyword`` column.
    embeddings : pandas.DataFrame
        One row per token: the ``keyword`` column plus the vector component
        columns (every column other than ``keyword``).
    keyword : str
        Name of the join column present in both frames.
        NOTE(review): both frames must hold ``keyword`` with the same dtype
        for the join to match; ``train_word_embedding`` emits strings -
        confirm the click frame was converted accordingly.

    Returns
    -------
    pandas.DataFrame
        Indexed by ``user_id``, one column per vector component, holding the
        mean over that user's rows. Tokens without an embedding contribute
        NaN and are skipped by the mean.
    """
    # Select the vector columns by name. A positional slice such as
    # `.iloc[:, 8:]` only works when `df` has exactly eight columns and
    # silently averages the wrong columns otherwise.
    vector_columns = [col for col in embeddings.columns if col != keyword]
    # Only the grouping and join keys are needed from the click frame, which
    # keeps the merged intermediate as small as possible.
    df_with_embedding = pd.merge(df[['user_id', keyword]], embeddings,on = keyword,how = 'left')
    average_embedding_of_user = df_with_embedding.groupby('user_id')[vector_columns].mean()
    return average_embedding_of_user

In [36]:
# Average the ad_id embeddings per training user; the creative_id,
# product_category and advertiser_id variants are left disabled.
# NOTE(review): `train_data_small` is not defined in any visible cell -
# confirm where it is created.
# uc_average = average_embedding(df = train_data_small,embeddings = uc_word_embedding,keyword = 'creative_id')
ua_average = average_embedding(df = train_data_small,embeddings = ua_word_embedding,keyword = 'ad_id')
# up_average = average_embedding(df = train_data_small,embeddings = up_word_embedding,keyword = 'product_category')
# uad_average = average_embedding(df = train_data_small,embeddings = uad_word_embedding,keyword = 'advertiser_id')
# print(uc_average.shape)

print(ua_average.shape)
# print(up_average.shape)
# print(uad_average.shape)

(900000, 128)


In [37]:
# Remove the combined train + test frame from the namespace so that its
# memory can be reclaimed. Cells that read `merged_train_test` must be run
# before this one.
del  merged_train_test

In [38]:
# Training embedding features: currently only the averaged ad_id embedding.
# The disabled line below would concatenate the advertiser_id average too.
# train_embedding_features = pd.concat([ua_average,uad_average],axis =1).reset_index()

train_embedding_features = ua_average
print(train_embedding_features.shape)
train_embedding_features.head(2)

(900000, 128)


Unnamed: 0_level_0,0,1,2,3,4,5,6,7,8,9,...,118,119,120,121,122,123,124,125,126,127
user_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,-0.152025,0.121167,-0.04862,0.002189,-0.358767,-0.044487,-0.368746,0.147768,-0.076578,0.00498,...,-0.056561,0.162761,0.329583,0.087586,0.085225,-0.052214,0.196376,0.227165,0.543608,-0.188884
2,0.079569,0.085393,-0.025908,-0.055146,-0.120967,0.092974,-0.488803,-0.081994,0.173521,0.112505,...,-0.070468,-0.073646,0.324124,0.073028,0.097306,-0.115681,-0.153083,0.018018,0.200274,-0.105739


In [140]:
# average_click_times = train_data_small.groupby('user_id')['click_times'].agg({'mean':'mean'})
# # average_click_times

is deprecated and will be removed in a future version
  """Entry point for launching an IPython kernel.


In [41]:
# Average the ad_id embeddings per test user; the other variants are left
# disabled.
# NOTE(review): `test_data_small` is not defined in any visible cell -
# confirm where it is created.
# test_uc_average = average_embedding(df = test_data_small,embeddings = uc_word_embedding,keyword = 'creative_id')
test_ua_average = average_embedding(df = test_data_small,embeddings = ua_word_embedding,keyword = 'ad_id')
# test_up_average = average_embedding(df = test_data_small,embeddings = up_word_embedding,keyword = 'product_category')
# test_uad_average = average_embedding(df = test_data_small,embeddings = uad_word_embedding,keyword = 'advertiser_id')


In [42]:

# Test embedding features: currently only the averaged ad_id embedding.
# test_embedding_features = pd.concat([test_ua_average,test_uad_average],axis =1).reset_index()
test_embedding_features = test_ua_average
test_embedding_features.head(2)
# Disabled: merge the per-user mean click_times with the embedding features.
# test_average_click_times = test_data_small.groupby('user_id')['click_times'].agg({'mean':'mean'})

# test_set = pd.merge(test_average_click_times,test_embedding_features,on='user_id',how = 'left').reset_index()
# # test_set.drop(['index'],axis=1,inplace = True)
# test_set.head(2)



Unnamed: 0_level_0,0,1,2,3,4,5,6,7,8,9,...,118,119,120,121,122,123,124,125,126,127
user_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
3000001,0.215502,-0.060719,0.290449,-0.140671,0.190818,-0.264221,-0.156917,0.140267,-0.17056,-0.137992,...,-0.36458,-0.088151,0.401653,-0.028697,-0.173631,-0.133954,0.333872,0.158099,0.20856,-0.154788
3000002,-0.032205,0.150872,-0.047872,-0.028696,0.012004,-0.075096,-0.371498,-0.032354,0.006698,0.450078,...,-0.22663,0.081873,0.04462,-0.028533,0.032354,-0.1488,0.152773,-0.157815,0.263057,-0.069168


In [17]:
# 1. Load the pickled product_id corpus (the list of per-user token lists
#    written by get_sequences_corpus).
import pickle
with open('product_id_corpus.pkl','rb') as f:
    _all = pickle.load(f)
print(len(_all))

# Join each user's tokens into one space-separated string, the form passed
# to the Keras Tokenizer in a later cell.
from tqdm import tqdm
corpus = []
for s in tqdm(_all):
    corpus.append(' '.join(s))
print('finished')

  4%|▍         | 76995/1900000 [00:00<00:02, 769945.23it/s]

1900000


100%|██████████| 1900000/1900000 [00:02<00:00, 844782.55it/s]

finished





In [20]:
from keras.preprocessing.text import Tokenizer
# Fit a Keras Tokenizer on the space-joined sequences so that every token
# gets an integer index, then convert each sequence to its index list.
# NOTE(review): only `split` is overridden, so the Tokenizer's default
# `filters` and lower-casing still apply; presumably this is why the
# missing-value token '\N' is looked up as 'n' when the embedding matrix is
# built - confirm.
tokenizer = Tokenizer(split=" ")
tokenizer.fit_on_texts(corpus)
word_index = tokenizer.word_index
print('word num = ',len(word_index))
X = tokenizer.texts_to_sequences(corpus)  # text2sequence

word num =  39057


In [21]:
import pickle 
# Persist the integer-encoded product_id sequences for later use.
with open('product_id_sequence.pkl','wb') as f:
    pickle.dump(X,f)

In [19]:
# 2. Build the initial embedding matrix: row i holds the Word2Vec vector of
#    the token whose Tokenizer index is i. The matrix has len(word_index) + 1
#    rows; row 0 is never written - presumably because Tokenizer indices
#    start at 1 (confirm).
# NOTE(review): this cell needs `word_index` from the Tokenizer cell and
# `tqdm` from the corpus-loading cell; run those first.
from gensim.models.word2vec import Word2Vec
wmodel = Word2Vec.load('product_id_embedding.model')
# Take the width from the trained model rather than hard-coding 100, so the
# matrix always matches the vector size the model was trained with.
word_embeddings = np.zeros((len(word_index) + 1,wmodel.wv.vector_size))
for word in tqdm(wmodel.wv.vocab.keys()):
    if word == '\\N':
        # The Tokenizer indexed the missing-value token '\N' under 'n'.
        word_embeddings[word_index['n']] = wmodel.wv['\\N']
    else:
        word_embeddings[word_index[word]] = wmodel.wv[word]

# protocol 4 supports pickling objects larger than 4 GiB.
with open('product_id_embedding_ini.pkl','wb') as em:
    pickle.dump(word_embeddings,em,protocol = 4)

print(word_embeddings.shape,word_embeddings[1])

100%|██████████| 39057/39057 [00:00<00:00, 319811.28it/s]

(39058, 100) [ 0.23044883 -0.01630943 -0.03246208  0.24671611  0.06502676 -0.15846227
  0.06397567  0.13599682 -0.06436037 -0.02838779  0.01434248  0.16639133
 -0.00540368 -0.12621582  0.09178828  0.25714022 -0.11634771  0.06272834
 -0.09889398 -0.03665477 -0.19223641 -0.05368068 -0.08485698 -0.14276268
  0.03558043  0.11265671 -0.12324455 -0.22877809  0.10239505  0.05244295
  0.00222947  0.09267782 -0.08292228  0.10219391 -0.03844807  0.2089213
 -0.04000506  0.19750275 -0.01347064  0.27418891  0.07990722 -0.19882905
  0.02087738  0.20387058 -0.08203249 -0.09889927 -0.14445929  0.22416835
 -0.07254054  0.14098933  0.1272143  -0.10365911 -0.11636129  0.12818548
  0.076173   -0.04820452  0.08305531 -0.01528021 -0.00428108  0.04352043
 -0.28099433  0.03300787 -0.05924641  0.03468425 -0.33502337  0.1485517
  0.38106647  0.02654032 -0.05090055  0.01308183 -0.01610269  0.10671831
  0.08627231  0.03098808 -0.22130147 -0.17802344 -0.26612797 -0.12431054
  0.06135784  0.27213898 -0.12463716 -0.




In [82]:
# from sklearn.feature_extraction.text import TfidfVectorizer
# tfidf_vec = TfidfVectorizer()
# tfidf_matrix = tfidf_vec.fit_transform(corpus)

{'18': 8,
 '17': 7,
 '12': 2,
 '13': 3,
 '16': 6,
 '11': 1,
 '15': 5,
 '10': 0,
 '14': 4}

In [78]:
# with open('tfidf_product_category.pkl','wb') as f:
#     pickle.dump(tfidf_matrix,f)

In [79]:
# with open('tfidf_ad_id.pkl','rb') as tf:
#     ad_id_tfidf = pickle.load(tf)

# with open('tfidf_creative_id.pkl','rb') as tf:
#     creative_id_tfidf = pickle.load(tf)
    
# with open('tfidf_advertiser_id.pkl','rb') as tf:
#     advertiser_id_tfidf = pickle.load(tf)
    
# with open('tfidf_product_category.pkl','rb') as tf:
#     product_category_tfidf = pickle.load(tf)

# with open('tfidf_industry.pkl','rb') as tf:
#     industry_tfidf = pickle.load(tf)
# ad_id_tfidf.shape,creative_id_tfidf.shape,advertiser_id_tfidf.shape,product_category_tfidf.shape,industry_tfidf.shape

((1900000, 10000),
 (1900000, 10000),
 (1900000, 10000),
 (1900000, 9),
 (1900000, 322))