# Feature engineering 2

This notebook generates the final features for each candidate.

- Input: feature engineering 1, network metrics
- Output: twitter metric features, sentiment features and network features for each candidate

## Imports

In [1]:
import pandas as pd
import logging
import pymongo
import os
import pyathena
import dotenv
import os
import sys
import datetime
# Put the parent of the current working directory on the import path (once),
# so modules that live one level above this notebook can be imported.
module_path = os.path.abspath(os.pardir)
if sys.path.count(module_path) == 0:
    sys.path.append(module_path)

#from utils.mongodb import *

In [2]:
# Log records carry timestamp, logger name, emitting function and level.
logging.basicConfig(
    format='[%(asctime)s] - %(name)s - %(funcName)s - %(levelname)s : %(message)s',
    level=logging.INFO,
)
log = logging.getLogger(__name__)

# Environment variables (MongoDB URL, Athena staging dir, AWS region) are read
# from the `.env` file located in `module_path`; this must run before the
# os.environ lookups below.
dotenv.load_dotenv(os.path.join(module_path, '.env'))

# MongoDB handle used for the network-metric collections.
mongo_client = pymongo.MongoClient(os.environ["MONGODB_URL"])
twitter_db = mongo_client.TwitterConstituyenteDB

# Athena connection used to read the candidates table.
conn = pyathena.connect(
    s3_staging_dir=os.environ["AWS_ATHENA_S3_STAGING_DIR"],
    region_name=os.environ["AWS_REGION"],
)

query = """
SELECT * FROM "twitter-constituyente"."constituyentes_full";
"""

# Full candidates table, with the electoral district stored as a string.
candidates_df = pd.read_sql(query, conn).astype({"electoral_district": "str"})

# Twitter user ids of every candidate that has one.
candidates_ids = candidates_df["user__id_str"].dropna().tolist()

# Per-candidate data keyed by Twitter user id:
#   rm                  -> 1 when the district is one of "8".."14", else 0
#                          (presumably the Metropolitan Region districts -- confirm)
#   district_percentage -> divided by 100 (assumes the source stores percent values)
candidates_data_df = (
    candidates_df
    .dropna(subset=["user__id_str"])
    .set_index("user__id_str")
    .assign(
        rm=lambda c: c["electoral_district"].isin([str(d) for d in range(8, 15)]).astype("int"),
        district_percentage=lambda c: c["district_percentage"] / 100,
    )
)

# Single-column views of the per-candidate table.
district_df = candidates_data_df.loc[:, ["electoral_district"]]
percentage_df = candidates_data_df.loc[:, ["district_percentage"]]


[2022-01-23 11:55:02,418] - botocore.credentials - load - INFO : Found credentials in environment variables.


## Loading features data

In [3]:
def network_7d_collection_to_agg_df(collection):
  """Load the candidates' documents from a collection into a (date, user) indexed frame.

  Only documents whose ``user__id_str`` is in the module-level ``candidates_ids``
  are fetched. The ``window_end`` field is exposed as ``date``; rows are ordered
  by that date and indexed by ``["date", "user__id_str"]``.

  Args:
    collection: object exposing ``find(filter)`` that yields dict-like documents
      (a pymongo collection in this notebook).

  Returns:
    pandas.DataFrame indexed by (date, user__id_str) holding the remaining fields.
  """
  candidate_filter = {"user__id_str": {"$in": candidates_ids}}
  records = pd.DataFrame(collection.find(candidate_filter))
  records = records.rename(columns={"window_end": "date"})
  records = records.sort_values("date")
  return records.set_index(["date", "user__id_str"])

# Windowed (`7d`) network-metric collections; these are the ones loaded by
# get_raw_network_features().
network_7d_collections = [twitter_db.network_metrics_7d_A, twitter_db.network_metrics_7d_pagerank]

# `_pop` counterparts of the windowed collections (their columns get a
# "__pop" suffix when loaded).
network_7d_collections_pop = [twitter_db.network_metrics_7d_pop_A, twitter_db.network_metrics_7d_pagerank_pop]

# Non-windowed network-metric collections.
# NOTE(review): not referenced anywhere in the visible part of this notebook.
network_total_collections = [
  twitter_db.network_metrics_A,
  twitter_db.network_metrics_pagerank_1, twitter_db.network_metrics_pagerank_2, twitter_db.network_metrics_pagerank_3,
]

# `_pop` counterparts of the non-windowed collections.
# NOTE(review): not referenced anywhere in the visible part of this notebook.
network_total_collections_pop = [twitter_db.network_metrics_pop_A, twitter_db.network_metrics_pagerank_pop]


def get_raw_network_features():
  """Build one wide frame with all windowed network metrics per (date, user).

  Every collection in ``network_7d_collections`` and
  ``network_7d_collections_pop`` is loaded with
  ``network_7d_collection_to_agg_df`` and the frames are joined column-wise.
  Columns coming from the ``_pop`` collections are suffixed with ``__pop``.

  Returns:
    pandas.DataFrame: base columns followed by the ``__pop`` columns.
  """
  def _side_by_side(collections):
    # Load each collection and align the resulting frames on their index.
    frames = [network_7d_collection_to_agg_df(source) for source in collections]
    return pd.concat(frames, axis=1)

  base_metrics = _side_by_side(network_7d_collections)
  pop_metrics = _side_by_side(network_7d_collections_pop).add_suffix("__pop")
  return pd.concat([base_metrics, pop_metrics], axis=1)


def get_raw_twitter_sentiment_features():
  """Read the Twitter/sentiment feature file and index it by (date, user).

  Reads ``./twitter_sentiment_features.parquet``, orders rows by date and user
  id, renames ``user.id_str`` to ``user__id_str`` and adds a ``week_monday``
  column holding the start of the weekly period each ``date`` falls in.

  Returns:
    pandas.DataFrame indexed by ``["date", "user__id_str"]``.
  """
  sentiment = pd.read_parquet("./twitter_sentiment_features.parquet")
  sentiment = sentiment.sort_values(["date", "user.id_str"])
  sentiment = sentiment.rename(columns={"user.id_str": "user__id_str"})
  # Start timestamp of the pandas 'W' period containing each date.
  weekly_period = sentiment['date'].dt.to_period('W')
  sentiment['week_monday'] = weekly_period.dt.start_time
  return sentiment.set_index(["date", "user__id_str"])


def get_raw_features():
  """Join the network features and the Twitter/sentiment features column-wise.

  Returns:
    pandas.DataFrame: network feature columns followed by the sentiment feature
    columns, aligned on the index the two source frames share.
  """
  network_part = get_raw_network_features()
  sentiment_part = get_raw_twitter_sentiment_features()
  return pd.concat([network_part, sentiment_part], axis=1)


In [4]:
# Move the index levels back into ordinary columns, then lower-case every
# column label so later lookups can use lower-case names throughout.
raw_features_df = get_raw_features().reset_index()
raw_features_df = raw_features_df.set_axis(raw_features_df.columns.str.lower(), axis=1)

def week_avg_raw_features(df, start_date, end_date):
    """Aggregate daily features into a per-candidate weekly average.

    Rows whose ``date`` lies in ``[start_date, end_date]`` (both inclusive) are
    kept, missing values are replaced by 0, and every numeric column is summed
    per ``user__id_str`` and divided by the number of weeks in the window.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain a ``date`` column comparable with ``start_date`` /
        ``end_date`` and a ``user__id_str`` column; other columns are features.
    start_date, end_date : datetime.datetime
        Inclusive bounds of the window. The number of weeks is
        ``((end_date - start_date).days + 1) / 7`` and may be fractional.

    Returns
    -------
    pandas.DataFrame
        Indexed by ``user__id_str``, one column per numeric feature.
    """
    in_window = df[(df["date"] >= start_date) & (df["date"] <= end_date)].fillna(0)
    n_days = (end_date - start_date).days + 1
    n_weeks = n_days/7
    log.info(f"number of days: {n_days}, number of weeks: {n_weeks}")
    # weekly average = daily average * 7, i.e. window total / number of weeks.
    # numeric_only=True: non-numeric columns such as `date` cannot be summed;
    # older pandas dropped them silently, pandas >= 2.0 raises TypeError instead.
    return in_window.groupby("user__id_str").sum(numeric_only=True)/n_weeks

## Base features

In [5]:
# Whole analysis period: start..end inclusive spans 18 weeks.
# It is split into start..middle1 and middle2..end, two adjacent sub-periods.
start = datetime.datetime(year=2021, month=1, day=9)
middle1 = datetime.datetime(year=2021, month=4, day=30)
middle2 = datetime.datetime(year=2021, month=5, day=1)
end = datetime.datetime(year=2021, month=5, day=14)

In [6]:
# Weekly averages over the whole start..end period, saved with the candidate id
# as a regular column.
week_agg_features_df = week_avg_raw_features(raw_features_df, start_date=start, end_date=end)
(week_agg_features_df
    .reset_index()
    .to_parquet("final_features.parquet", index=False))

[2022-01-23 11:58:31,588] - __main__ - week_avg_raw_features - INFO : number of days: 126, number of weeks: 18.0


## First 16 weeks vs Last 2 weeks features

In [7]:
# Same weekly averages, computed separately for the two sub-periods.

# start..middle1 (first 16 weeks)
week_agg_features_first16_df = week_avg_raw_features(
    raw_features_df, start_date=start, end_date=middle1)
(week_agg_features_first16_df
    .reset_index()
    .to_parquet("final_features_first16weeks.parquet", index=False))

# middle2..end (last 2 weeks)
week_agg_features_last2_df = week_avg_raw_features(
    raw_features_df, start_date=middle2, end_date=end)
(week_agg_features_last2_df
    .reset_index()
    .to_parquet("final_features_last2weeks.parquet", index=False))

[2022-01-23 11:58:32,680] - __main__ - week_avg_raw_features - INFO : number of days: 112, number of weeks: 16.0
[2022-01-23 11:58:32,868] - __main__ - week_avg_raw_features - INFO : number of days: 14, number of weeks: 2.0


## List aggregated features

In [8]:
# Sum the per-candidate weekly features over every candidate that shares the
# same (district, list) pair. The grouping key is "D<district>&<list>".
list_agg_base_df = (
    week_agg_features_df
    .join(candidates_data_df[["electoral_district", "list"]])
    .assign(list=lambda joined: joined["electoral_district"].map("D{}&".format) + joined["list"])
    .reset_index(drop=True)
    .drop(columns=["electoral_district"])
    .groupby("list")
    .sum()
)

# The list-level table must expose exactly the same feature columns, in the
# same order, as the per-candidate table.
assert week_agg_features_df.columns.to_list() == list_agg_base_df.columns.to_list()
(list_agg_base_df
    .reset_index()
    .to_parquet("final_features_list_agg.parquet", index=False))