## Подключение к базе и основные таблицы

In [1]:
! pip3 install psycopg2-binary

Collecting psycopg2-binary
  Downloading psycopg2_binary-2.9.9-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (4.4 kB)
Downloading psycopg2_binary-2.9.9-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (3.0 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m3.0/3.0 MB[0m [31m12.8 MB/s[0m eta [36m0:00:00[0m00:01[0m0:01[0m
[?25hInstalling collected packages: psycopg2-binary
Successfully installed psycopg2-binary-2.9.9


In [2]:
# Создадим переменную connection_path для того чтобы подключаться к базе данных не указывая явно в коде логин и пароль от БД

config_file = "config.txt"
with open(config_file, "r") as f:
    config_data = f.readlines()

config = {}
for line in config_data:
    key, value = line.strip().split("=")
    config[key] = value
    
connection_path = f"postgresql://{config['username']}:{config['password']}@{config['host']}:{config['port']}/{config['database']}"

In [3]:
# создадим connection чтобы не загружать память целиком, а выгружать из БД потоково

from sqlalchemy import create_engine

engine = create_engine(connection_path)

connection = engine.connect().execution_options(stream_results=True)

In [4]:
import pandas as pd

# Данные по пользователям

user_info = pd.read_sql(
    """SELECT * FROM public.user_data""",
    
    con= connection
)

user_info.head()

Unnamed: 0,user_id,gender,age,country,city,exp_group,os,source
0,200,1,34,Russia,Degtyarsk,3,Android,ads
1,201,0,37,Russia,Abakan,0,Android,ads
2,202,1,17,Russia,Smolensk,4,Android,ads
3,203,0,18,Russia,Moscow,1,iOS,ads
4,204,0,36,Russia,Anzhero-Sudzhensk,3,Android,ads


In [5]:
# Посты и топики

posts_info = pd.read_sql(
    """SELECT * FROM public.post_text_df""",
    
    con=connection
)

posts_info.head()

Unnamed: 0,post_id,text,topic
0,1,UK economy facing major risks\n\nThe UK manufa...,business
1,2,Aids and climate top Davos agenda\n\nClimate c...,business
2,3,Asian quake hits European shares\n\nShares in ...,business
3,4,India power shares jump on debut\n\nShares in ...,business
4,5,Lacroix label bought by US firm\n\nLuxury good...,business


In [6]:
# Почти 77 миллионов записей, в связи с ограничением по памяти будем работать не со всем датасетом

count_feed_data = pd.read_sql(
    """SELECT count(*) FROM public.feed_data""",
    
    con=connection
)

count_feed_data.head()

Unnamed: 0,count
0,76892800


In [7]:
# Попробуем забрать, скажем, 6 миллионов

feed_data = pd.read_sql(
    """SELECT * FROM public.feed_data LIMIT 6000000""",
    
    con=connection
)

feed_data.head()

Unnamed: 0,timestamp,user_id,post_id,action,target
0,2021-11-11 20:42:09,85355,1504,view,0
1,2021-11-11 20:42:19,85355,1727,view,0
2,2021-11-15 19:09:40,85355,93,view,0
3,2021-11-15 19:10:10,85355,3463,view,1
4,2021-11-15 19:11:26,85355,3463,like,0


## Работа с данными и фичи для контентной модели

In [8]:
"""
Данные устроены так что если человек посмотрел пост и не лайкнул его, то: 
action = view и target = 0, а если лайкнул его то:
action = view и target = 1 + сразу же строкой ниже action = like и target = 0
Кажется, что информация с action = like излишняя, поэтому просто удалим ее из датасета и будем ориентироваться на 
колоонку target = [0,1]
"""

# Почистим данные от action != view

feed_data = feed_data[feed_data.action=='view']

feed_data.head()

Unnamed: 0,timestamp,user_id,post_id,action,target
0,2021-11-11 20:42:09,85355,1504,view,0
1,2021-11-11 20:42:19,85355,1727,view,0
2,2021-11-15 19:09:40,85355,93,view,0
3,2021-11-15 19:10:10,85355,3463,view,1
5,2021-11-15 19:11:28,85355,4105,view,0


Будем использозвать **контентный подход**: для любой пары (user_id, post_id) + дате timestamp 
предсказывать, случится лайк или нет.

Будем использовать модель, которая умеет предсказывать вероятности лайка

#### Работа с данными по пользователям. Создадим несколько дополнительных признаков.

In [9]:
user_info

Unnamed: 0,user_id,gender,age,country,city,exp_group,os,source
0,200,1,34,Russia,Degtyarsk,3,Android,ads
1,201,0,37,Russia,Abakan,0,Android,ads
2,202,1,17,Russia,Smolensk,4,Android,ads
3,203,0,18,Russia,Moscow,1,iOS,ads
4,204,0,36,Russia,Anzhero-Sudzhensk,3,Android,ads
...,...,...,...,...,...,...,...,...
163200,168548,0,36,Russia,Kaliningrad,4,Android,organic
163201,168549,0,18,Russia,Tula,2,Android,organic
163202,168550,1,41,Russia,Yekaterinburg,4,Android,organic
163203,168551,0,38,Russia,Moscow,3,iOS,organic


In [10]:
# Средний возраст по городу
def users_average_age_per_city(df):
    av_age = df.groupby('city')['age'].mean()
    df['av_age_per_city'] = df['city'].map(av_age)
    return df

In [11]:
# Количество человек в городе
def count_users_in_city(df):
    count = df['city'].value_counts()  # Считаем число пользователей в каждом городе
    df['users_in_city'] = df['city'].map(count)  # Map'им к DataFrame
    return df

In [12]:
def user_feature_creation(df):
    df = count_users_in_city(df)
    df = users_average_age_per_city(df)
    
    return df

In [13]:
user_info = user_feature_creation(user_info)

In [14]:
user_info

Unnamed: 0,user_id,gender,age,country,city,exp_group,os,source,users_in_city,av_age_per_city
0,200,1,34,Russia,Degtyarsk,3,Android,ads,20,25.600000
1,201,0,37,Russia,Abakan,0,Android,ads,241,27.672199
2,202,1,17,Russia,Smolensk,4,Android,ads,434,26.822581
3,203,0,18,Russia,Moscow,1,iOS,ads,21874,27.223553
4,204,0,36,Russia,Anzhero-Sudzhensk,3,Android,ads,105,25.809524
...,...,...,...,...,...,...,...,...,...,...
163200,168548,0,36,Russia,Kaliningrad,4,Android,organic,569,26.994728
163201,168549,0,18,Russia,Tula,2,Android,organic,628,26.869427
163202,168550,1,41,Russia,Yekaterinburg,4,Android,organic,1924,27.056133
163203,168551,0,38,Russia,Moscow,3,iOS,organic,21874,27.223553


#### Работа с данными по постам. Создадим несколько дополнительных признаков.

In [15]:
posts_info

Unnamed: 0,post_id,text,topic
0,1,UK economy facing major risks\n\nThe UK manufa...,business
1,2,Aids and climate top Davos agenda\n\nClimate c...,business
2,3,Asian quake hits European shares\n\nShares in ...,business
3,4,India power shares jump on debut\n\nShares in ...,business
4,5,Lacroix label bought by US firm\n\nLuxury good...,business
...,...,...,...
7018,7315,"OK, I would not normally watch a Farrelly brot...",movie
7019,7316,I give this movie 2 stars purely because of it...,movie
7020,7317,I cant believe this film was allowed to be mad...,movie
7021,7318,The version I saw of this film was the Blockbu...,movie


Создадим векторные представления текста после его предварительной обработки, включающей стемминг и 
исключение ненужных сиволов, а также воспользуемся TF-IDF векторизацией.

In [16]:
import nltk
nltk.download('wordnet')

[nltk_data] Downloading package wordnet to /usr/share/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

In [17]:
import re
import string
from nltk.stem import PorterStemmer
from sklearn.feature_extraction.text import TfidfVectorizer

ps = PorterStemmer()

def preprocessing(line, stemmer=ps):
    line = line.lower()
    line = re.sub(r"[{}]".format(string.punctuation), " ", line)
    line = line.replace('\n\n', ' ').replace('\n', ' ')
    line = ' '.join([stemmer.stem(x) for x in line.split(' ')])
    return line

tfidf = TfidfVectorizer(
    stop_words='english',
    preprocessor=preprocessing
)


In [18]:
tfidf_data = (
    tfidf
    .fit_transform(posts_info['text'])
    .toarray()
)

tfidf_data



array([[0.        , 0.        , 0.        , ..., 0.        , 0.        ,
        0.        ],
       [0.        , 0.13826577, 0.        , ..., 0.        , 0.        ,
        0.        ],
       [0.        , 0.05102525, 0.        , ..., 0.        , 0.        ,
        0.        ],
       ...,
       [0.        , 0.        , 0.        , ..., 0.        , 0.        ,
        0.        ],
       [0.        , 0.        , 0.        , ..., 0.        , 0.        ,
        0.        ],
       [0.        , 0.        , 0.        , ..., 0.        , 0.        ,
        0.        ]])

In [19]:
tfidf_data = pd.DataFrame(
    tfidf_data,
    index=posts_info.post_id,
    columns=tfidf.get_feature_names_out()
)

tfidf_data

Unnamed: 0_level_0,00,000,0001,000bn,000m,000th,001,001and,001st,002,...,𝓫𝓮,𝓫𝓮𝓽𝓽𝓮𝓻,𝓬𝓸𝓾𝓻𝓽𝓼,𝓱𝓮𝓪𝓻𝓲𝓷𝓰,𝓶𝓪𝔂,𝓹𝓱𝔂𝓼𝓲𝓬𝓪𝓵,𝓼𝓸𝓸𝓷𝓮𝓻,𝓼𝓾𝓫𝓸𝓻𝓭𝓲𝓷𝓪𝓽𝓮,𝓽𝓱𝓮,𝓽𝓸
post_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.138266,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.051025,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
5,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
7315,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
7316,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
7317,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
7318,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


Далее сгенерируем фичи на основе TfIdf

In [20]:
posts_info['TotalTfIdf'] = tfidf_data.sum(axis=1).reset_index()[0]
posts_info['MaxTfIdf'] = tfidf_data.max(axis=1).reset_index()[0]
posts_info['MeanTfIdf'] = tfidf_data.mean(axis=1).reset_index()[0]

posts_info.head()

Unnamed: 0,post_id,text,topic,TotalTfIdf,MaxTfIdf,MeanTfIdf
0,1,UK economy facing major risks\n\nThe UK manufa...,business,8.512769,0.501487,0.000237
1,2,Aids and climate top Davos agenda\n\nClimate c...,business,11.722547,0.320992,0.000327
2,3,Asian quake hits European shares\n\nShares in ...,business,12.368293,0.250983,0.000345
3,4,India power shares jump on debut\n\nShares in ...,business,6.542363,0.551521,0.000182
4,5,Lacroix label bought by US firm\n\nLuxury good...,business,6.208541,0.431124,0.000173


Далее попытаемся разделить тексты на разные классы. Используем для этого сначала PCA, чтобы сократить размеры векторных представлений, а потом найдем расстояния до центра каждого из кластеров для каждого текста и используем это расстояние как новый признак.
Количество кластеров оценили по Elbow Method, и небольшой перегиб в инерции наблюдали на 15-16 классах.
Для PCA возьмем размерность представления равную 20

In [21]:
# Кластеризуем тексты

from sklearn.decomposition import PCA

centered = tfidf_data - tfidf_data.mean()

pca = PCA(n_components=20)
pca_decomp = pca.fit_transform(centered)

from sklearn.cluster import KMeans

n_clusters=15

kmeans = KMeans(n_clusters=n_clusters, random_state=0).fit(pca_decomp)

posts_info['TextCluster'] = kmeans.labels_

dists_columns = [f'DistanceToCluster_{i}' for i in range(1, n_clusters + 1)]

dists_df = pd.DataFrame(
    data=kmeans.transform(pca_decomp),
    columns=dists_columns
)

dists_df.head()



Unnamed: 0,DistanceToCluster_1,DistanceToCluster_2,DistanceToCluster_3,DistanceToCluster_4,DistanceToCluster_5,DistanceToCluster_6,DistanceToCluster_7,DistanceToCluster_8,DistanceToCluster_9,DistanceToCluster_10,DistanceToCluster_11,DistanceToCluster_12,DistanceToCluster_13,DistanceToCluster_14,DistanceToCluster_15
0,0.477947,0.526779,0.454623,0.516541,0.127957,0.420935,0.458389,0.459616,0.546805,0.378606,0.50075,0.453681,0.548903,0.559437,0.55292
1,0.352508,0.406478,0.305963,0.40321,0.280582,0.189609,0.329508,0.316152,0.43368,0.232752,0.371178,0.318482,0.246362,0.452019,0.41565
2,0.369793,0.414201,0.338084,0.410393,0.133574,0.294855,0.347646,0.329116,0.448455,0.181541,0.381643,0.325653,0.492002,0.482147,0.456298
3,0.310041,0.379492,0.271501,0.321113,0.255126,0.224788,0.304738,0.28197,0.399231,0.076124,0.343288,0.256555,0.442078,0.441563,0.422416
4,0.242283,0.326931,0.238108,0.308893,0.307928,0.145489,0.245488,0.203756,0.358684,0.102479,0.286952,0.167044,0.402372,0.404932,0.390496


In [22]:
posts_info = pd.concat((posts_info,dists_df), axis=1)

posts_info.head()

Unnamed: 0,post_id,text,topic,TotalTfIdf,MaxTfIdf,MeanTfIdf,TextCluster,DistanceToCluster_1,DistanceToCluster_2,DistanceToCluster_3,...,DistanceToCluster_6,DistanceToCluster_7,DistanceToCluster_8,DistanceToCluster_9,DistanceToCluster_10,DistanceToCluster_11,DistanceToCluster_12,DistanceToCluster_13,DistanceToCluster_14,DistanceToCluster_15
0,1,UK economy facing major risks\n\nThe UK manufa...,business,8.512769,0.501487,0.000237,4,0.477947,0.526779,0.454623,...,0.420935,0.458389,0.459616,0.546805,0.378606,0.50075,0.453681,0.548903,0.559437,0.55292
1,2,Aids and climate top Davos agenda\n\nClimate c...,business,11.722547,0.320992,0.000327,5,0.352508,0.406478,0.305963,...,0.189609,0.329508,0.316152,0.43368,0.232752,0.371178,0.318482,0.246362,0.452019,0.41565
2,3,Asian quake hits European shares\n\nShares in ...,business,12.368293,0.250983,0.000345,4,0.369793,0.414201,0.338084,...,0.294855,0.347646,0.329116,0.448455,0.181541,0.381643,0.325653,0.492002,0.482147,0.456298
3,4,India power shares jump on debut\n\nShares in ...,business,6.542363,0.551521,0.000182,9,0.310041,0.379492,0.271501,...,0.224788,0.304738,0.28197,0.399231,0.076124,0.343288,0.256555,0.442078,0.441563,0.422416
4,5,Lacroix label bought by US firm\n\nLuxury good...,business,6.208541,0.431124,0.000173,9,0.242283,0.326931,0.238108,...,0.145489,0.245488,0.203756,0.358684,0.102479,0.286952,0.167044,0.402372,0.404932,0.390496


In [23]:
# Воспроизведем итоговый датафрейм со всеми новыми фичами 

df = pd.merge(feed_data,
              posts_info,
              on='post_id',
              how='left')

df.head()

Unnamed: 0,timestamp,user_id,post_id,action,target,text,topic,TotalTfIdf,MaxTfIdf,MeanTfIdf,...,DistanceToCluster_6,DistanceToCluster_7,DistanceToCluster_8,DistanceToCluster_9,DistanceToCluster_10,DistanceToCluster_11,DistanceToCluster_12,DistanceToCluster_13,DistanceToCluster_14,DistanceToCluster_15
0,2021-11-11 20:42:09,85355,1504,view,0,Gardener battles to narrow win\n\nJason Garden...,sport,13.201747,0.247299,0.000368,...,0.404114,0.320817,0.422663,0.551457,0.45557,0.469253,0.472167,0.560197,0.482127,0.555989
1,2021-11-11 20:42:19,85355,1727,view,0,Hearts of Oak 3-2 Cotonsport\n\nHearts of Oak ...,sport,8.01586,0.35401,0.000223,...,0.29892,0.171978,0.276639,0.424193,0.321198,0.346568,0.282889,0.484594,0.434493,0.289179
2,2021-11-15 19:09:40,85355,93,view,0,German growth goes into reverse\n\nGermanys ec...,business,8.781742,0.283298,0.000245,...,0.447265,0.481445,0.476423,0.556708,0.397844,0.515427,0.470582,0.577941,0.57962,0.57804
3,2021-11-15 19:10:10,85355,3463,view,1,"Despite the pandemic, Etsys sales and business...",covid,3.244717,0.490515,9e-05,...,0.253308,0.264827,0.2279,0.303161,0.226482,0.297129,0.132213,0.462106,0.42379,0.423134
4,2021-11-15 19:11:28,85355,4105,view,0,"Roll on 2021, as if Covid 19 and the devastati...",covid,3.32295,0.467389,9.3e-05,...,0.409582,0.437764,0.403975,0.102288,0.409099,0.457617,0.34004,0.56165,0.541513,0.553212


In [24]:
df = pd.merge(df,
              user_info,
              on='user_id',
              how='left')

df.head()

Unnamed: 0,timestamp,user_id,post_id,action,target,text,topic,TotalTfIdf,MaxTfIdf,MeanTfIdf,...,DistanceToCluster_15,gender,age,country,city,exp_group,os,source,users_in_city,av_age_per_city
0,2021-11-11 20:42:09,85355,1504,view,0,Gardener battles to narrow win\n\nJason Garden...,sport,13.201747,0.247299,0.000368,...,0.555989,0,31,Russia,Ryazan,1,Android,ads,697,28.053085
1,2021-11-11 20:42:19,85355,1727,view,0,Hearts of Oak 3-2 Cotonsport\n\nHearts of Oak ...,sport,8.01586,0.35401,0.000223,...,0.289179,0,31,Russia,Ryazan,1,Android,ads,697,28.053085
2,2021-11-15 19:09:40,85355,93,view,0,German growth goes into reverse\n\nGermanys ec...,business,8.781742,0.283298,0.000245,...,0.57804,0,31,Russia,Ryazan,1,Android,ads,697,28.053085
3,2021-11-15 19:10:10,85355,3463,view,1,"Despite the pandemic, Etsys sales and business...",covid,3.244717,0.490515,9e-05,...,0.423134,0,31,Russia,Ryazan,1,Android,ads,697,28.053085
4,2021-11-15 19:11:28,85355,4105,view,0,"Roll on 2021, as if Covid 19 and the devastati...",covid,3.32295,0.467389,9.3e-05,...,0.553212,0,31,Russia,Ryazan,1,Android,ads,697,28.053085


Выделим признаки из timestamp и проверим пару гипотез

In [25]:
# От времени просмотра может зависеть склонность пользователей лайкать или игнорировать посты

df['hour'] = pd.to_datetime(df['timestamp']).apply(lambda x: x.hour)
df['month'] = pd.to_datetime(df['timestamp']).apply(lambda x: x.month)
df['day_of_week'] = pd.to_datetime(df['timestamp']).apply(lambda x: x.day_name())

df.head()

Unnamed: 0,timestamp,user_id,post_id,action,target,text,topic,TotalTfIdf,MaxTfIdf,MeanTfIdf,...,country,city,exp_group,os,source,users_in_city,av_age_per_city,hour,month,day_of_week
0,2021-11-11 20:42:09,85355,1504,view,0,Gardener battles to narrow win\n\nJason Garden...,sport,13.201747,0.247299,0.000368,...,Russia,Ryazan,1,Android,ads,697,28.053085,20,11,Thursday
1,2021-11-11 20:42:19,85355,1727,view,0,Hearts of Oak 3-2 Cotonsport\n\nHearts of Oak ...,sport,8.01586,0.35401,0.000223,...,Russia,Ryazan,1,Android,ads,697,28.053085,20,11,Thursday
2,2021-11-15 19:09:40,85355,93,view,0,German growth goes into reverse\n\nGermanys ec...,business,8.781742,0.283298,0.000245,...,Russia,Ryazan,1,Android,ads,697,28.053085,19,11,Monday
3,2021-11-15 19:10:10,85355,3463,view,1,"Despite the pandemic, Etsys sales and business...",covid,3.244717,0.490515,9e-05,...,Russia,Ryazan,1,Android,ads,697,28.053085,19,11,Monday
4,2021-11-15 19:11:28,85355,4105,view,0,"Roll on 2021, as if Covid 19 and the devastati...",covid,3.32295,0.467389,9.3e-05,...,Russia,Ryazan,1,Android,ads,697,28.053085,19,11,Monday


In [26]:
### Уберем все ненужные колонки

df = df.drop([
#    'timestamp',  ### timestamp пока оставим, понадобитмя потом для разделения данных
    'action',
    'text',
],
    axis=1)

df = df.set_index(['user_id', 'post_id'])

df.head()

Unnamed: 0_level_0,Unnamed: 1_level_0,timestamp,target,topic,TotalTfIdf,MaxTfIdf,MeanTfIdf,TextCluster,DistanceToCluster_1,DistanceToCluster_2,DistanceToCluster_3,...,country,city,exp_group,os,source,users_in_city,av_age_per_city,hour,month,day_of_week
user_id,post_id,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1
85355,1504,2021-11-11 20:42:09,0,sport,13.201747,0.247299,0.000368,6,0.497094,0.491865,0.48831,...,Russia,Ryazan,1,Android,ads,697,28.053085,20,11,Thursday
85355,1727,2021-11-11 20:42:19,0,sport,8.01586,0.35401,0.000223,6,0.330679,0.376188,0.373057,...,Russia,Ryazan,1,Android,ads,697,28.053085,20,11,Thursday
85355,93,2021-11-15 19:09:40,0,business,8.781742,0.283298,0.000245,4,0.492646,0.539818,0.493022,...,Russia,Ryazan,1,Android,ads,697,28.053085,19,11,Monday
85355,3463,2021-11-15 19:10:10,1,covid,3.244717,0.490515,9e-05,11,0.170782,0.329329,0.316673,...,Russia,Ryazan,1,Android,ads,697,28.053085,19,11,Monday
85355,4105,2021-11-15 19:11:28,0,covid,3.32295,0.467389,9.3e-05,8,0.380096,0.481214,0.459853,...,Russia,Ryazan,1,Android,ads,697,28.053085,19,11,Monday


## Обучение модели

In [27]:
# Baseline решение возьмем с относительно простой модели - решающего дерева
# Далее вобмем бустинг

### Валидация:
# Так как данные имеют временную структуру train и test будем разбивать по колонке timestamp, 
# чтобы при решении не 'подглядывать ответы'

max(df.timestamp), min(df.timestamp)

(Timestamp('2021-12-29 23:43:31'), Timestamp('2021-10-01 06:01:52'))

In [28]:
# За отсечку возьмем 2021-12-15
### За отсечку возьмем 2021-12-15
split_date = '2021-12-15'

df_train = df[df.timestamp < split_date]
df_test = df[df.timestamp >= split_date]

df_train = df_train.drop('timestamp', axis=1)
df_test = df_test.drop('timestamp', axis=1)

X_train = df_train.drop('target', axis=1)
X_test = df_test.drop('target', axis=1)

y_train = df_train['target']
y_test = df_test['target']

y_train.shape, y_test.shape

((4477649,), (883050,))

In [29]:
X_train

Unnamed: 0_level_0,Unnamed: 1_level_0,topic,TotalTfIdf,MaxTfIdf,MeanTfIdf,TextCluster,DistanceToCluster_1,DistanceToCluster_2,DistanceToCluster_3,DistanceToCluster_4,DistanceToCluster_5,...,country,city,exp_group,os,source,users_in_city,av_age_per_city,hour,month,day_of_week
user_id,post_id,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1
85355,1504,sport,13.201747,0.247299,0.000368,6,0.497094,0.491865,0.488310,0.525735,0.501511,...,Russia,Ryazan,1,Android,ads,697,28.053085,20,11,Thursday
85355,1727,sport,8.015860,0.354010,0.000223,6,0.330679,0.376188,0.373057,0.385115,0.419553,...,Russia,Ryazan,1,Android,ads,697,28.053085,20,11,Thursday
85355,93,business,8.781742,0.283298,0.000245,4,0.492646,0.539818,0.493022,0.534006,0.148324,...,Russia,Ryazan,1,Android,ads,697,28.053085,19,11,Monday
85355,3463,covid,3.244717,0.490515,0.000090,11,0.170782,0.329329,0.316673,0.310330,0.323345,...,Russia,Ryazan,1,Android,ads,697,28.053085,19,11,Monday
85355,4105,covid,3.322950,0.467389,0.000093,8,0.380096,0.481214,0.459853,0.434850,0.484082,...,Russia,Ryazan,1,Android,ads,697,28.053085,19,11,Monday
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
87081,2725,covid,3.613683,0.375296,0.000101,11,0.148541,0.323117,0.314836,0.229422,0.364581,...,Russia,Rostov,2,Android,ads,1541,27.018819,9,11,Tuesday
87081,4775,movie,10.622235,0.395886,0.000296,7,0.425759,0.358584,0.440865,0.460227,0.485738,...,Russia,Rostov,2,Android,ads,1541,27.018819,9,11,Tuesday
87081,2852,covid,3.103873,0.417524,0.000087,11,0.113985,0.357601,0.338635,0.237227,0.390162,...,Russia,Rostov,2,Android,ads,1541,27.018819,9,11,Tuesday
87081,4377,movie,4.143706,0.382769,0.000115,1,0.454860,0.216026,0.488070,0.505266,0.534996,...,Russia,Rostov,2,Android,ads,1541,27.018819,9,11,Tuesday


#### Baseline - решающее дерево

In [30]:
from sklearn.compose import ColumnTransformer
from category_encoders import TargetEncoder
from category_encoders.one_hot import OneHotEncoder

object_cols = [
    'topic', 'TextCluster', 'gender', 'country',
    'city', 'exp_group', 'hour', 'month', 'day_of_week',
    'os', 'source'
]

cols_for_ohe = [x for x in object_cols if X_train[x].nunique() < 5]
cols_for_mte = [x for x in object_cols if X_train[x].nunique() >= 5]

### Cохраним индексы этих колонок

cols_for_ohe_idx = [list(X_train.columns).index(col) for col in cols_for_ohe]
cols_for_mte_idx = [list(X_train.columns).index(col) for col in cols_for_mte]

transforms = [
    ('OneHotEncoder', OneHotEncoder(), cols_for_ohe_idx),
    ('MeanTargetEncoder', TargetEncoder(), cols_for_mte_idx)
]

from sklearn.tree import DecisionTreeClassifier
from sklearn.pipeline import Pipeline

pipe_dt = Pipeline([("column_transformer",
                     ColumnTransformer(transformers=transforms)),
                     
                    ("decision_tree", 
                     DecisionTreeClassifier())])

pipe_dt.fit(X_train, y_train)

In [31]:
# Замерим качество работы такой модели
# Возьмем ROC-AUC как просили в изначальной постановке задания

from sklearn.metrics import roc_auc_score

print(f"Качество на трейне: {roc_auc_score(y_train, pipe_dt.predict_proba(X_train)[:, 1])}")
print(f"Качество на тесте: {roc_auc_score(y_test, pipe_dt.predict_proba(X_test)[:, 1])}")

Качество на трейне: 0.9441012533887414
Качество на тесте: 0.5281773709027405


Видим сильное переобучение и, конечно, качество 0.52 не совсем показательно для нашей модели так как у нас в датасете используются далеко не все имеющиеся данные, и прогнозы по-хорошему надо делать для каждого пользователя по всем постам, а не только по тем что попали в выборку.

#### CatBoost

In [34]:
# Теперь обучим катбуст без настройки парамтеров, настраивать парамтеры будем позднее

from catboost import CatBoostClassifier

catboost = CatBoostClassifier(iterations = 200, task_type='GPU', verbose=100)

catboost.fit(X_train, y_train, object_cols)

Learning rate set to 0.089793
0:	learn: 0.6238853	total: 516ms	remaining: 1m 42s
100:	learn: 0.3369110	total: 52.7s	remaining: 51.7s
199:	learn: 0.3351771	total: 1m 42s	remaining: 0us


<catboost.core.CatBoostClassifier at 0x7a4f9c4adc00>

In [35]:
# Замерим качество работы такой модели
# Возьмем ROC-AUC

print(f"Качество на трейне: {roc_auc_score(y_train, catboost.predict_proba(X_train)[:, 1])}")
print(f"Качество на тесте: {roc_auc_score(y_test, catboost.predict_proba(X_test)[:, 1])}")

Качество на трейне: 0.692357833693662
Качество на тесте: 0.6640327387970089


Видим сильное улучшение качества и меньшую переобученность даже для почти базовой реализации CatBoost

### Немного поэксперементируем с параметрами для CatBoost чтобы получить лучшее качество

##### ***!!!Лучше не запускать ячейку ниже так как искать ответ будет долго и значимых улучшений в качестве не нашли***

In [None]:
from sklearn.model_selection import GridSearchCV

# Определим сетку параметров поиска
param_grid = {
    'iterations': [100, 200 ],            # Количество итераций обучения
    'learning_rate': [1, 0.1, 0.01 ],      # Скорость обучения
    'depth': [2, 4, 6],                       # Глубина дерева
    'auto_class_weights': ['None', 'Balanced'] # Автоматическое взвешивание классов
}

# Создаем экземпляр GridSearchCV
grid_search = GridSearchCV(estimator=catboost, param_grid=param_grid, cv=5, scoring='roc_auc')

# Прогоняем GridSearchCV на обучающих данных
grid_search.fit(X_train, y_train, cat_features=object_cols)

In [38]:
# Выведем лучшие параметры и  лучший результат

print(grid_search.best_params_)

print(grid_search.best_score_)

{'auto_class_weights': 'Balanced', 'depth': 6, 'iterations': 200, 'learning_rate': 0.1}
0.6681578049499394


In [None]:
best_catboost = CatBoostClassifier(**grid_search.best_params_, verbose = 50)

best_catboost.fit(X_train, y_train, cat_features=object_cols)


In [40]:
# Замерим качество работы такой модели
# Возьмем ROC-AUC

print(f"Качество на трейне: {roc_auc_score(y_train, best_catboost.predict_proba(X_train)[:, 1])}")
print(f"Качество на тесте: {roc_auc_score(y_test, best_catboost.predict_proba(X_test)[:, 1])}")

Качество на трейне: 0.7031747316411989
Качество на тесте: 0.66224770676795


В итоге по поиску значимого улучшения не увиели, можно увеличить количество итераций до 1000 (проводил эксперимент отдельно), это даст прибавку в несколько сотых, но займет много времени. В целях экономии времени не стал запускать этот поиск.

# Итоги

In [None]:
### Посмотрим на итоговую feature_importance, чтобы оцениьт насколько хорошие признаки были сгенерированы

import seaborn as sns
import numpy as np
import matplotlib.pyplot as plt

def plot_feature_importance(importance, names, model_type):
    # Преобразование значений в массивы признаков и их значимости
    feature_importance = np.array(importance)
    feature_names = np.array(names)
    
    # Создание DataFrame с использованием словаря
    fi_df = pd.DataFrame({'feature_names': feature_names, 'feature_importance': feature_importance})
    
    # Сортировка DataFrame по убыванию значимости признаков
    fi_df = fi_df.sort_values(by='feature_importance', ascending=False)
    
    # Определение размеров графика
    plt.figure(figsize=(10, 8))
    # Построение графика с использованием Seaborn
    sns.barplot(x='feature_importance', y='feature_names', data=fi_df, palette='viridis')
    # Добавление названий и меток к графику
    plt.title(model_type + ' FEATURE IMPORTANCE', fontsize=16)
    plt.xlabel('FEATURE IMPORTANCE', fontsize=14)
    plt.ylabel('FEATURE NAMES', fontsize=14)
    plt.xticks(fontsize=12)
    plt.yticks(fontsize=12)

plot_feature_importance(catboost.feature_importances_, X_train.columns, 'Catboost')


Видим что смогли создать несколько важных параметров для построения модели таких как: month, users_in_city,Mean_TFIDF, Text_Cluster

### Сохраним модель и положим в базу фичи, необходимые для функционала нашей модели

In [37]:
catboost.save_model(
    'model_control',
    format="cbm"                  
)

In [38]:
posts_info.to_sql(    
   "koriakov_posts_info_features",                    
    con=connection_path,                      
    schema="public",                   
    if_exists='replace'            
   )                               
                                   

23