In [None]:
import pandas as pd
import numpy as np
import seaborn as sns
import math

### Literal Advanced Analysis
This notebook uses the CSV file produced by the `literal_analysis` notebook. The file is an aggregation of the global call dataset, with one row per argument (per function) giving its most frequent literal and its literal ratio.

In [None]:
# Name of the summarized CSV; it has no directory part, so it is resolved
# against the notebook's current working directory.
summary_filename = "literalsCompSample_all_summarized.csv"
# Load the summary into the frame every later cell works on.
df = pd.read_csv(summary_filename)

In [None]:
# Distribution of literal_ratio over 50 bins; each row contributes its
# `count` value as weight instead of counting once.
df["literal_ratio"].hist(weights=df["count"], bins=50)

In [None]:
# Show the column labels available in the loaded frame.
df.columns

Definition of a simple score measuring how predictable a literal is. To avoid bias from `LIST`, `TUPLE`, ... tokens, the score is set to 0 as soon as one of the most frequent literals is one of those tokens.

In [None]:
def literal_predict_score(row):
    """Score how predictable the literal passed for one argument is.

    The score is::

        log(count) * literal_ratio * most_freq_values_ratio
            / sqrt(unique_values_count)

    so it grows with the number of calls, with the share of calls passing a
    literal and with the dominance of the most frequent value, and it shrinks
    as the number of distinct values grows.

    Parameters
    ----------
    row : mapping or pandas.Series
        Must provide ``literal_ratio``, ``most_freq_values``, ``count``,
        ``most_freq_values_ratio`` and ``unique_values_count``.

    Returns
    -------
    int or float
        ``0`` when the row is rejected, otherwise the score. A row is
        rejected when:

        * ``literal_ratio`` is below 0.4 (or NaN);
        * the string form of ``most_freq_values`` contains one of the
          container tokens ``LIST``, ``COMPREHENSION``, ``TUPLE``, ``DICT``;
        * ``count`` or ``unique_values_count`` is not strictly positive, or
          ``most_freq_values_ratio`` is NaN. These cases would otherwise
          raise or yield a NaN score.
    """
    literal_ratio = row['literal_ratio']
    # `not (x >= t)` rather than `x < t` so that a NaN ratio is rejected too.
    if not (literal_ratio >= 0.4):
        return 0

    # Substring match on the string form: any container token anywhere in the
    # most frequent values disqualifies the row.
    mfv = str(row['most_freq_values'])
    if any(token in mfv for token in ('LIST', 'COMPREHENSION', 'TUPLE', 'DICT')):
        return 0

    count = row['count']
    mfv_ratio = row['most_freq_values_ratio']
    unique_count = row['unique_values_count']
    # math.log(0) raises ValueError and sqrt(0) leads to a division by zero;
    # `not (x > 0)` also catches NaN. A NaN ratio would poison the score.
    if not (count > 0) or not (unique_count > 0) or math.isnan(mfv_ratio):
        return 0

    return math.log(count) * literal_ratio * mfv_ratio / math.sqrt(unique_count)

def corrected_most_freq_ratio(row):
    """Return the row's ``most_freq_values_ratio``, or ``-1`` if unusable.

    A row is unusable (sentinel ``-1``) when any of the following holds,
    checked in this order:

    * ``literal_ratio`` is below 0.2;
    * the string form of ``most_freq_values`` contains ``LIST``,
      ``COMPREHENSION``, ``TUPLE`` or ``DICT``;
    * ``literal_count`` is below 10;
    * ``most_freq_values_ratio`` is NaN.

    Parameters
    ----------
    row : mapping or pandas.Series
        Must provide ``literal_ratio``, ``most_freq_values``,
        ``literal_count`` and ``most_freq_values_ratio``.

    Returns
    -------
    number
        ``-1`` for an unusable row, otherwise ``most_freq_values_ratio``
        unchanged.
    """
    if row['literal_ratio'] < 0.2:
        return -1

    # Substring search on the rendered value: a container token anywhere in
    # the most frequent values rejects the row.
    rendered = str(row['most_freq_values'])
    for token in ('LIST', 'COMPREHENSION', 'TUPLE', 'DICT'):
        if token in rendered:
            return -1

    # Short-circuit keeps the NaN test from running when there are too few
    # literals, exactly like two separate guards would.
    ratio_unusable = row['literal_count'] < 10 or np.isnan(row['most_freq_values_ratio'])
    return -1 if ratio_unusable else row['most_freq_values_ratio']

# axis=1 hands each row to the function; the results become new columns.
# `score` is 0 for rows rejected by literal_predict_score.
df['score'] = df.apply(literal_predict_score, axis=1)
# `corrected_mfvr` is -1 for rows rejected by corrected_most_freq_ratio.
df['corrected_mfvr'] = df.apply(corrected_most_freq_ratio, axis=1)

In [None]:
# Display the rows ordered from the highest score to the lowest
# (returns a sorted copy; `df` itself is left in its original order).
df.sort_values(by='score', ascending=False)

Let's plot the score against the number of calls for these arguments. We are mostly interested in functions with a high score and many calls. The color encodes the literal ratio.

In [None]:
# Keep only the rows whose score is strictly above 0.2.
target = df[df['score'] > 0.2]
# Score on x, call count on y, points coloured by literal_ratio.
# NOTE(review): sharex=False is presumably the usual workaround that keeps the
# x-axis label visible when a colorbar is drawn — confirm.
target.plot.scatter(x="score", y="count", c="literal_ratio",  colormap='viridis', sharex=False )

In [None]:
# Sum of the `count` column over the rows selected into `target`.
target["count"].sum()

In [None]:
# Natural logarithm of `count`, stored as a new column; it is the colour
# dimension of the scatter plot that follows.
df['log_count'] = np.log(df['count'])

In [None]:
# literal_ratio on x against most_freq_values_ratio on y for every row,
# coloured by log_count.
df.plot(kind='scatter', x='literal_ratio', y='most_freq_values_ratio', c='log_count', colormap='viridis', sharex=False)

In [None]:
# corrected_most_freq_ratio returns -1 for rows it rejects, so `>= 0` keeps
# only the rows it accepted.
valid_most_freq = df[df['corrected_mfvr'] >= 0]

In [None]:
# 50-bin histogram of the corrected ratio over the retained rows.
valid_most_freq['corrected_mfvr'].plot(kind='hist', bins=50, title="Most freq. values ratio")

In [None]:
# Number of rows retained in valid_most_freq.
len(valid_most_freq)

In [None]:
# Inspect a random sample of the retained rows. The fixed random_state makes
# the displayed rows identical on every "Restart & Run All"; capping n at the
# frame length avoids the ValueError pandas raises when asked for more rows
# than exist (sampling is without replacement).
valid_most_freq.sample(n=min(20, len(valid_most_freq)), random_state=42)