In [1]:
import os
import time
import numpy as np
import pandas as pd 
from dotenv import load_dotenv

from sqlalchemy import create_engine, inspect
load_dotenv('./.env')

True

In [2]:
DB_IP = os.environ.get("DB_IP")
DB_PORT = os.environ.get("DB_PORT")
DB_NAME = os.environ.get("DB_NAME")
DB_USER = os.environ.get("DB_USER")
DB_PASSWORD = os.environ.get("DB_PASSWORD")

url = f"postgresql://{DB_USER}:{DB_PASSWORD}@{DB_IP}:{DB_PORT}/{DB_NAME}"
engine = create_engine(url)
inspector = inspect(engine)

inspector.get_schema_names()

['information_schema', 'ml_house', 'parse', 'public']

In [3]:
start = time.time()

metrics = pd.read_sql_table("posts_metrics", schema="parse", con=engine)

print((time.time() - start)/60)

10.326252210140229


In [4]:
metrics.sample(3)

Unnamed: 0,id,views,forwards,comments,paid_reactions,standard_reactions,custom_reactions
3530776,5209785,69436.0,145.0,130.0,0,"{'🔥': 2078, '🤪': 162, '❤': 125, '👍': 105, '🤣':...",{}
2205525,3876198,11265.0,14.0,25.0,0,{'👍': 27},{}
9704313,11351330,9601.0,34.0,,0,"{'👍': 88, '🥴': 36, '❤': 8, '🤩': 5, '🥰': 4, '🤔'...",{}


In [6]:
metrics.standard_reactions.iloc[3530776]

{'🔥': 2078,
 '🤪': 162,
 '❤': 125,
 '👍': 105,
 '🤣': 33,
 '🤬': 25,
 '🌭': 24,
 '🌚': 22,
 '😐': 11,
 '😁': 10,
 '💘': 9}

In [7]:
metrics.shape

(17544153, 7)

# Feature engineering

In [8]:
metrics["sum_standard_reactions"] = metrics.standard_reactions.apply(lambda x: sum(x.values()))
metrics["sum_custom_reactions"] = metrics.custom_reactions.apply(lambda x: sum(x.values()))
metrics["total_sum_reactions"] = metrics["sum_standard_reactions"] + metrics["sum_custom_reactions"]

In [9]:
top_reactions = (
    pd.Series(
        [key for line in metrics.standard_reactions for key in line.keys()] 
    )
    .value_counts(ascending=False)
)

top_reactions.head(10)

👍      6510340
❤      5793166
🔥      3874934
😁      1652689
👎      1421439
❤‍🔥    1296481
👏      1249224
🤔       867314
😢       761499
🥰       754676
Name: count, dtype: int64

In [12]:
k = 5
top_k_reactions = list(top_reactions.index[:k])
print(top_k_reactions)

['👍', '❤', '🔥', '😁', '👎']


In [13]:
map_names = ["good_finger", "heart", "fire", "fun", "bad_finger"]
mapping = {k: v for k,v in zip(map_names, top_k_reactions)}
mapping

{'good_finger': '👍', 'heart': '❤', 'fire': '🔥', 'fun': '😁', 'bad_finger': '👎'}

In [14]:
def get_good_finger(x):

    if mapping['good_finger'] in x.keys():
        return x[mapping['good_finger']]
    
    else: return 0 
    
def get_heart(x):

    if mapping['heart'] in x.keys():
        return x[mapping['heart']]
    
    else: return 0 

def get_fire(x):

    if mapping['fire'] in x.keys():
        return x[mapping['fire']]
    
    else: return 0 

def get_fun(x):

    if mapping['fun'] in x.keys():
        return x[mapping['fun']]
    
    else: return 0 

def get_bad_finger(x):

    if mapping['bad_finger'] in x.keys():
        return x[mapping['bad_finger']]
    
    else: return 0 

In [15]:
metrics["good_finger"] = metrics.standard_reactions.map(get_good_finger)
metrics["heart"] = metrics.standard_reactions.map(get_heart)
metrics["fire"] = metrics.standard_reactions.map(get_fire)
metrics["fun"] = metrics.standard_reactions.map(get_fun)
metrics["bad_finger"] = metrics.standard_reactions.map(get_bad_finger)

metrics.sample()

Unnamed: 0,id,views,forwards,comments,paid_reactions,standard_reactions,custom_reactions,sum_standard_reactions,sum_custom_reactions,total_sum_reactions,good_finger,heart,fire,fun,bad_finger
6781969,8450431,143.0,0.0,,0,{},{},0,0,0,0,0,0,0,0


In [16]:
metrics["total_activites"] = metrics["forwards"] + metrics["comments"] + metrics["paid_reactions"] + metrics["sum_custom_reactions"]
metrics.sample()

Unnamed: 0,id,views,forwards,comments,paid_reactions,standard_reactions,custom_reactions,sum_standard_reactions,sum_custom_reactions,total_sum_reactions,good_finger,heart,fire,fun,bad_finger,total_activites
4366893,6037630,7144.0,14.0,,0,{},{},0,0,0,0,0,0,0,0,


# relative features

In [17]:
cols = list(set(metrics.columns) - set(["id", "views", "standard_reactions", "custom_reactions"]))

for x in cols: 
    metrics[f"relative_{x}"] = metrics[x].fillna(0) / metrics["views"]

In [18]:
metrics.to_csv("../data/metrics_fe.csv", index=False)

# restart kernel, clean ram for saving

In [22]:
import os
import time
import numpy as np
import pandas as pd 
from dotenv import load_dotenv

from sqlalchemy import create_engine, inspect
load_dotenv('./.env')

True

In [23]:
DB_IP = os.environ.get("DB_IP")
DB_PORT = os.environ.get("DB_PORT")
DB_NAME = os.environ.get("DB_NAME")
DB_USER = os.environ.get("DB_USER")
DB_PASSWORD = os.environ.get("DB_PASSWORD")

url = f"postgresql://{DB_USER}:{DB_PASSWORD}@{DB_IP}:{DB_PORT}/{DB_NAME}"
engine = create_engine(url)
inspector = inspect(engine)

In [24]:
metrics = pd.read_csv("../data/metrics_fe.csv")

In [25]:
start = time.time()
basis_ad = pd.read_sql_table("ad_basis", schema="ml_house", con=engine)
print((time.time() - start)/60)

0.5906165242195129


In [26]:
type(basis_ad.id.iloc[0]), type(metrics.id.iloc[0])

(numpy.int64, numpy.int64)

In [27]:
basis_with_metrics = (
    basis_ad
    .merge(
        metrics,
        on="id",
        how="inner"
    )
)

basis_with_metrics.shape

(177373, 45)

In [28]:
basis_with_metrics.sample(3)

Unnamed: 0,id,raw_text,urls,geo,poll,via_bot_id,via_business_bot_id,cnt_urls,target_flag1,target_flag2,...,relative_good_finger,relative_sum_custom_reactions,relative_forwards,relative_heart,relative_comments,relative_fun,relative_sum_standard_reactions,relative_bad_finger,relative_fire,relative_total_activites
34806,11730553,💬СушиSell это про людей\n\nЕсли тебе близки це...,{https://t.me/+oQDVky2_lkozYWM6},,,,,1,0,0,...,0.000642,0.0,0.002087,0.0,0.0,0.0,0.000642,0.0,0.0,0.0
150612,4313027,erid: 5jtCeReNx12oajt4ZrEdVWg\n\nСмартфон Real...,"{https://t.me/zakazhisam/47424,https://telegra...",,,,,2,0,0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
66610,13871483,"👼С Рождеством! \n\nЖелаем, чтобы в доме царили...",{https://bit.ly/3NFvR7O},,,,,1,0,0,...,0.002051,0.0,0.000373,0.005695,0.00039,0.0,0.009628,6.8e-05,0.00061,0.000763


In [30]:
start = time.time()
basis_with_metrics.to_sql("basis_with_metrics", schema="ml_house", con=engine)
print((time.time() - start)/60)

27.12190844217936
