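# This script finds the most informative features (the TF-IDF terms with the lowest and highest Ridge coefficients) for each individual text feature. The supported text features are domains, mentions, hashtags, followees, text (Tweets and Retweets), and bios (bio, follower_bios, followee_bios).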

# In [94]:
# general packages 
import pandas as pd
import numpy as np
import pickle
from urllib.request import urlopen
import re
import sys
import copy
import pyarrow.parquet as pq
import pyarrow as pa
import time
import os
import plotly.graph_objects as go
import plotly.express as px
import plotly.io as pio
from ipywidgets import interact, interactive, fixed, interact_manual
import ipywidgets as widgets

# pipeline packages 
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV
from sklearn.linear_model import Ridge
from scipy.stats import pearsonr

# Informative Features

# In [None]:
def _build_vectorizer(feat, n_gram, maxDF, minDF):
    """Return the TF-IDF vectorizer configured for the text feature `feat`."""
    shared = dict(ngram_range=n_gram, max_df=maxDF, min_df=minDF, use_idf=True)

    if feat == 'text':
        # the token pattern only matches tokens of six or more characters
        return TfidfVectorizer(binary=False, analyzer='word',
                               token_pattern=r'(?ui)\b\w[a-z]+\w[a-z]+\w[a-z]+\b', **shared)
    if feat == 'followees':
        # followees are the only feature vectorized with binary term counts
        return TfidfVectorizer(binary=True, **shared)
    if feat in ('domains', 'hashtags', 'mentions'):
        return TfidfVectorizer(binary=False, **shared)
    if feat in ('bio', 'follower_bios', 'followee_bios'):
        # the token pattern only matches tokens of four or more characters
        return TfidfVectorizer(binary=False, analyzer='word',
                               token_pattern=r'(?ui)\b\w[a-z]+\w[a-z]+\b', **shared)

    # fail early with a clear message instead of an UnboundLocalError later on
    raise ValueError("Unsupported feature: {!r}".format(feat))


def get_informative_features(data, feat, target, results_folder, n=10, maxDF=1.0, minDF=10, n_gram=(1,1), my_state=17, display=True):
    """
    This function performs feature extraction using TF-IDF, runs Ridge with cross-validation,
    and reports the most informative features (lowest and highest Ridge coefficients) as a
    plotly chart that is also saved as an HTML file.

    Inputs:
    data (pandas df): master dataframe; rows where `feat` is null are dropped
    feat (string): name of feature; one of 'text', 'followees', 'domains', 'hashtags',
        'mentions', 'bio', 'follower_bios', 'followee_bios'
    target (string): CRT target feature (numeric, conceptual, or both)
        NOTE(review): predictions are clipped to [0, 1] and the stratification bins span
        [0, 1], so the target is presumably scaled to that range - confirm against caller
    results_folder (string): path to informative features results folder (created if missing)
    n (integer): number of top features to show
    maxDF (float): TF-IDF max_df parameter
    minDF (float): TF-IDF min_df parameter
    n_gram ((integer, integer)): TF-IDF n_gram range parameter
    my_state (integer): random state for train/test split
    display (boolean): if True, display chart

    Outputs:
    r (float): Pearson r correlation coefficient between held-out targets and Ridge predictions
    p (float): p value of that correlation
    chart (plotly): chart displaying the informative features

    Raises:
    ValueError: if `feat` is not one of the supported feature names
    """
    vectorizer = _build_vectorizer(feat, n_gram, maxDF, minDF)

    # regularization strengths searched by the grid search
    params = {'alpha': np.logspace(-5, 5, 100)}

    data = data[data[feat].notnull()]
    X_text = vectorizer.fit_transform(data[feat])
    Y = data[target]

    # bin the continuous target so the train/test split can be stratified on it
    bins = np.linspace(0, 1, 7)
    y_binned = np.digitize(Y, bins)

    # run Ridge regression + cross-validation
    X_train, X_test, Y_train, Y_test = train_test_split(
        X_text, Y, test_size=0.2, stratify=y_binned, random_state=my_state)

    grid = GridSearchCV(Ridge(), param_grid=params, cv=10, n_jobs=-1, verbose=0,
                        scoring='neg_mean_squared_error')
    grid.fit(X_train, Y_train)

    # clip predictions to the [0, 1] interval
    y_pred = np.maximum(0, np.minimum(grid.predict(X_test), 1))

    # rank (coefficient, feature name) pairs in ascending order of coefficient
    coefs_with_fns = sorted(zip(grid.best_estimator_.coef_, vectorizer.get_feature_names_out()))
    lowest = coefs_with_fns[:n]
    highest = coefs_with_fns[:-(n + 1):-1]  # n largest, largest first

    # create coefficient dictionary
    coefs_dict = {
        'high_coefs': [coef for coef, _ in highest],
        'high_names': [name for _, name in highest],
        'low_coefs': [coef for coef, _ in lowest],
        'low_names': [name for _, name in lowest],
    }

    r, p = pearsonr(Y_test, y_pred)

    chart = create_plotly(coefs_dict, feat, target, r, p, minDF, maxDF, n_gram)

    if display:
        # a bare `chart` expression does nothing inside a function; render explicitly
        chart.show()

    # an empty results_folder means "current directory", which needs no creation
    if results_folder:
        os.makedirs(results_folder, exist_ok=True)

    # join with os.path so the file lands inside results_folder with or without a trailing slash
    chart.write_html(os.path.join(results_folder,
                                  "informative_features_{}_{}.html".format(feat, n_gram)))

    return (r, p, chart)


# In [125]:
def create_plotly(coefs_dict, feat, target, r, p, min_df, max_df, n_gram):
    """
    This function builds the plotly table that summarizes the informative features results.

    Inputs:
    coefs_dict (dictionary): contains four keys (high_coefs, high_names, low_coefs, low_names)
    feat (string): name of feature, shown in the chart title
    target (string): target feature name; accepted but not used in the chart
    r (float): Pearson r correlation coefficient, shown in the chart title
    p (float): p value, shown in the chart title
    min_df (float): TF-IDF min_df parameter, shown in the annotation above the table
    max_df (float): TF-IDF max_df parameter, shown in the annotation above the table
    n_gram ((integer, integer)): TF-IDF n_gram range parameter, shown in the annotation

    Outputs:
    fig (plotly): FigureWidget holding a four-column table of the lowest and highest
        coefficients with their feature names
    """
    # coefficients are rendered with four decimals
    four_decimals = "{:.4f}".format

    column_titles = ['Lowest Coefficients', 'Feature Names (Lowest)',
                     'Highest Coefficients', 'Feature Names (Highest)']
    column_values = [
        [four_decimals(coef) for coef in coefs_dict['low_coefs']],
        coefs_dict['low_names'],
        [four_decimals(coef) for coef in coefs_dict['high_coefs']],
        coefs_dict['high_names'],
    ]

    # table with alternating column shading
    table = go.Table(
        header=dict(values=column_titles, fill_color='papayawhip', align='left'),
        cells=dict(values=column_values,
                   fill=dict(color=['lightgray', 'snow', 'lightgray', 'snow']),
                   align='left'),
    )

    # TF-IDF settings, centered just above the plotting area
    settings_note = go.layout.Annotation(
        showarrow=False,
        text='min_df: {}, max_df: {}, n_gram: {}'.format(min_df, max_df, n_gram),
        xanchor='center',
        x=.5,
        yanchor='bottom',
        y=1.03,
    )
    layout = go.Layout(width=1000, height=600, annotations=[settings_note])

    fig = go.FigureWidget(data=[table], layout=layout)

    # centered title reporting the correlation results, plus the chart-wide font
    title_spec = {
        'text': "Informative Features for {} (r: {:.3f}; p value: {:.3f})".format(feat, r, p),
        'x': 0.5,
        'y': .93,
        'xanchor': 'center',
        'yanchor': 'top',
        'font': dict(size=17),
    }
    base_font = dict(family="Courier New, monospace", color="black", size=12)
    fig.update_layout(title=title_spec, font=base_font)

    return fig