# Analysis of the impact of supporting literals and keywords for autocompletion of arguments in function calls
This analysis uses the data produced by the local-pipeline `khulnasoft-lab/local-pipelines/python-offline-metrics/cmds/literal-analysis`.

The Quip document https://fastnode.quip.com/bzVRAmaORGLk/Impact-of-supporting-new-argument-types presents the results obtained from this analysis.

In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import urllib
import numpy as np
import json

# Input report, generated by
# khulnasoft-lab/local-pipelines/python-offline-metrics/cmds/literal-analysis
REPORT_FILENAME = "literalsCompSample_all.json"



def preprocess(obj):
    """Hook applied to every decoded record before it is tabulated.

    Currently the identity function: the record is returned unchanged.
    """
    return obj

def get_data(filename):
    """Load a JSON-lines report into a flat DataFrame.

    Each non-blank line of ``filename`` is decoded as one JSON document,
    passed through ``preprocess``, and the resulting records are flattened
    with pandas' ``json_normalize``.

    Args:
        filename: path of the report file (one JSON document per line).

    Returns:
        A ``pandas.DataFrame`` with one row per record.
    """
    # The context manager closes the handle deterministically; iterating the
    # file streams it instead of materialising every line with readlines().
    # Blank lines (e.g. a trailing newline) are skipped instead of crashing
    # json.loads.
    with open(filename, 'r', encoding='utf-8') as report:
        records = [preprocess(json.loads(line))
                   for line in report
                   if line.strip()]
    # ``pd.io.json.json_normalize`` was deprecated in pandas 1.0 and is gone
    # from recent releases; only fall back to it on versions that predate
    # the top-level ``pd.json_normalize``.
    normalize = getattr(pd, 'json_normalize', None)
    if normalize is None:
        normalize = pd.io.json.json_normalize
    return normalize(records)
    
def render(df):
    def make_link(url):
        return '<a href="{}">{}</a>'.format(url, "inspect")
    return df.style.format({'inspect': make_link})

# Load the whole report once; N is its number of rows.
df = get_data(REPORT_FILENAME)
N = df.shape[0]

# Column names, dtypes and non-null counts of the loaded frame.
df.info()

Percentage of different ArgType over the corpus, numerically and as a pie chart

In [None]:
# Share of each ArgType value; the denominator is the total number of rows.
df["ArgType"].value_counts().div(len(df))

In [None]:
# Number of rows per ArgType value, most frequent first.
argsType = df["ArgType"].value_counts()
def build_type_pie_charts(argsType, title=""):
    """Plot a pie chart of type counts, folding rare types into one slice.

    Args:
        argsType: Series of counts indexed by type name (e.g. the result of
            ``value_counts()``).
        title: title given to the plot.

    Returns:
        Whatever ``Series.plot(kind='pie')`` returns for the folded counts.
    """
    total = argsType.sum()
    threshold = total / 50  # a type keeps its own slice only above 2% of the total
    big_items = argsType[argsType > threshold]
    small_items = argsType[argsType <= threshold]
    other = pd.Series({"Other Types": small_items.sum()})
    # Series.append was deprecated in pandas 1.4 and removed in 2.0;
    # pd.concat gives the same result and exists on every release.
    types = pd.concat([big_items, other])
    return types.plot(kind='pie', title=title)

# Pie chart of the ArgType distribution over the whole corpus (no title).
build_type_pie_charts(argsType, title="")


### Focus on a per function basis
The next step of the analysis focuses on a per-function grouping. Do some functions use more literal arguments than others?

In [None]:
# Group the rows by called function and show the number of rows in each group.
function_grouping = df.groupby(by="FunctionName")
function_grouping.size()

In [None]:
# Largest CallID value present in the data.
df.CallID.max()

Some tooling to quickly identify whether an arg is a literal.

In [None]:
# ArgType values that are counted as literals by this analysis.
literal_types = [
    '*pythonast.ComprehensionExpr',
    '*pythonast.DictExpr',
    '*pythonast.ListComprehensionExpr',
    '*pythonast.ListExpr',
    '*pythonast.NumberExpr',
    '*pythonast.StringExpr',
    '*pythonast.TupleExpr',
]

def is_literal(row):
    """Tell whether the ``ArgType`` of ``row`` is one of ``literal_types``."""
    arg_type = row['ArgType']
    return arg_type in literal_types

In [None]:
# Vectorised membership test: same boolean column as applying is_literal to
# every row, without building one Series object per row of the frame.
df['is_literal'] = df['ArgType'].isin(literal_types)

What is the global percentage of literals among the args?

In [None]:
# Pie chart of literal vs. non-literal rows.
df['is_literal'].value_counts().plot(kind='pie')

In [None]:
# Fraction of rows flagged as literal.
df['is_literal'].mean()

Drill down to separate the different args for each function. We are using the ArgSpec to group together named arguments and positional arguments which correspond to the same _arg_ in the function prototype

In [None]:
# One group per (function, argument spec) pair.
per_arg_grouping = df.groupby(by=["FunctionName", "ArgSpec"])
# Group sizes, largest first, flattened into a frame with a 'Count' column.
group_sizes = per_arg_grouping.size().sort_values(ascending=False)
per_arg_serie = group_sizes.reset_index().rename(index=str, columns={0: 'Count'})
per_arg_serie


In [None]:
# Passing a {new_name: func} dict to SeriesGroupBy.agg was deprecated in
# pandas 0.20 and raises SpecificationError since 1.0. Aggregating by name and
# relabelling afterwards yields the same two columns on every version.
literal_avg = (per_arg_grouping["is_literal"]
               .agg(["mean", "count"])
               .rename(columns={"mean": "literal ratio"}))

The next cell presents the ratio of literal/total_call for each argument of the known functions.
Our main targets are the arguments with a high rate of literal calls.

In [None]:
# Draw the random sample first and sort it afterwards: sample() returns rows in
# random order, so sorting before sampling was lost. The sample size is capped
# so that a frame with fewer than 25 rows does not raise.
literal_avg.sample(min(25, len(literal_avg))).sort_values(ascending=False, by=['literal ratio', 'count'])

In [None]:
"""
This function provides a few metrics on each group (each group contains all the values for a given argument of a function)
The main point of studies is the diversity of the literals used as value. 
If this diversity is low, that means it will be easier to predict the values.
"""
def aggregate_group_info(group):
    """Summarise the literal values observed in one group of rows.

    The point of interest is the diversity of the literal values: the fewer
    distinct values there are, the easier they are to predict.

    Args:
        group: DataFrame with at least the ``is_literal`` (boolean) and
            ``LiteralValue`` columns.

    Returns:
        A Series with the entries ``count`` (rows in the group),
        ``literal_count`` (rows flagged as literal), ``literal_ratio``,
        ``unique_values_count`` (distinct literal values),
        ``most_freq_values_ratio`` (share of the literal rows covered by the
        5 most frequent values, NaN when the group has no literal) and
        ``most_freq_values`` (those values, joined by two spaces).
    """
    count = len(group)
    literal_ratio = group["is_literal"].mean()
    literal_elements = group[group['is_literal']]
    literal_count = len(literal_elements)

    uniques_values = literal_elements.groupby("LiteralValue").size()
    val_count = len(uniques_values)
    most_frequent_values = uniques_values.sort_values(ascending=False)[:5]
    most_freq_lit_repr = "  ".join(most_frequent_values.index.astype('str'))
    # Without any literal the ratio is undefined: return NaN explicitly rather
    # than relying on a 0/0 division (and its RuntimeWarning).
    if literal_count:
        most_freq_ratio = most_frequent_values.sum() / literal_count
    else:
        most_freq_ratio = float('nan')

    return pd.Series(dict(
        count=count,
        literal_count=literal_count,
        literal_ratio=literal_ratio,
        unique_values_count=val_count,
        most_freq_values_ratio=most_freq_ratio,
        most_freq_values=most_freq_lit_repr,
    ))
    
# One summary row per (FunctionName, ArgSpec) group.
summarize_function_call = per_arg_grouping.apply(func=aggregate_group_info)
# Displayed only: the flattened frame is not stored, so
# summarize_function_call keeps its group keys in the index.
summarize_function_call.reset_index()

In [None]:
# Summary rows of a single function, selected through the index.
target_function = 'django.utils.safestring.mark_safe'
summarize_function_call.loc[target_function, :]

Let's save the aggregated dataset in a csv file for some more advanced analysis/summary (cf `literal_summary` notebook which used this file to go a bit further)

In [None]:
# Output name: the report name minus its last 5 characters (".json"),
# followed by "_summarized.csv".
summary_csv_path = "{}_summarized.csv".format(REPORT_FILENAME[:-5])
summarize_function_call.reset_index().to_csv(summary_csv_path)

In [None]:
# FunctionName is an index level of summarize_function_call, not a column (the
# reset_index() result above is displayed but never assigned), so filtering on
# summarize_function_call['FunctionName'] raises KeyError. Flatten the index
# into columns first, then filter.
summary_flat = summarize_function_call.reset_index()
summary_flat.loc[summary_flat['FunctionName'] == 'django.utils.safestring.mark_safe']

#### Focus on NumberExpr
What are the numbers used as literals, and what does their distribution look like?

In [None]:
# Keep only the rows whose argument is a number literal.
is_number = df['ArgType'] == '*pythonast.NumberExpr'
numbers_only = df[is_number]
grouped_numbers = numbers_only.groupby(by="LiteralValue")

In [None]:
# Frequency of each number literal, as a fraction of all number-literal rows.
number_counts = numbers_only['LiteralValue'].value_counts()
number_serie = number_counts.sort_values(ascending=False) / len(numbers_only)

In [None]:
# The ten most frequent number literals.
number_serie.nlargest(n=10).reset_index()

### Focus on StringExpr
Same analysis for string literals.

In [None]:
# Keep only the rows whose argument is a string literal.
is_string = df['ArgType'] == '*pythonast.StringExpr'
strings_only = df[is_string]
grouped_strings = strings_only.groupby(by="LiteralValue")

In [None]:
# Occurrences of each string literal (raw counts, not divided by the number
# of string-literal rows).
string_counts = strings_only['LiteralValue'].value_counts()
string_serie = string_counts.sort_values(ascending=False)


In [None]:
# The ten most frequent string literals.
string_serie.nlargest(n=10).reset_index()

# Keyword arguments
The next section analyzes how frequently arguments are named in a call, and the keyword distribution.

In [None]:
# Fraction of rows whose argument is passed positionally.
df['Positional'].mean()

In [None]:
# Pie chart of positional vs. non-positional rows.
df['Positional'].value_counts().plot(kind='pie')

Creation of a direct indicator for named arguments

In [None]:
# is_named is the element-wise negation of Positional
# (assumes Positional is a boolean column — TODO confirm).
df['is_named'] = ~df.Positional

In [None]:
# Regroup by (function, argument spec) now that df carries the is_named column.
per_arg_grouping = df.groupby(by=["FunctionName", "ArgSpec"])

The `named arg ratio` corresponds to how frequently a given argument for a function is named. 1 means always named, 0 means never named. 
Note: An argument captured by `**KWARGS` is always named.


In [None]:
# Passing a {new_name: func} dict to SeriesGroupBy.agg was deprecated in
# pandas 0.20 and raises SpecificationError since 1.0. Aggregating by name and
# relabelling afterwards yields the same two columns on every version.
named_avg = (per_arg_grouping["is_named"]
             .agg(["mean", "count"])
             .rename(columns={"mean": "named arg ratio"}))

In [None]:
# Random sample of 25 arguments, shown from most to least frequently named.
named_avg.sample(n=25).sort_values(by=['named arg ratio', 'count'], ascending=False)

In [None]:
# Histogram of the named-arg ratio, every (function, argument) pair counted once.
named_ratio = named_avg["named arg ratio"]
named_ratio.plot(kind='hist', title="Non-weighted Distribution", bins=50, log=False)

In [None]:
# Same histogram, each (function, argument) pair weighted by its row count.
named_avg["named arg ratio"].plot(
    kind='hist',
    weights=named_avg["count"],
    bins=50,
    log=False,
    title="Weighted Distribution",
)

The next steps will focus on arguments that are most frequently named (`named arg ratio > 0.9`).

In [None]:
# Arguments that are named in more than 90% of their occurrences.
is_mostly_named = named_avg['named arg ratio'] > 0.9
mostly_named = named_avg[is_mostly_named].reset_index()

In [None]:
# Flag the arguments whose ArgSpec is the **KWARGS catch-all.
mostly_named['is_kwargs'] = mostly_named['ArgSpec'].eq('**KWARGS')

In [None]:
# Random sample of 25 mostly-named arguments, largest row count first.
mostly_named.sample(n=25).sort_values(ascending=False, by='count')

## Subset of corpus: Named arguments
The next steps only consider the named-argument events: we keep only the named arguments, so we can no longer compute the named arg ratio.


In [None]:
# Keep only the named arguments. The explicit copy makes strictly_named an
# independent frame, so adding columns to it later neither touches df nor
# triggers SettingWithCopyWarning. The boolean column is used directly as the
# mask instead of being compared with True.
strictly_named = df[df["is_named"]].copy()

In [None]:
# Display the subset of rows that correspond to named arguments.
strictly_named

We look at how many of these named arguments are captured by `**KWARGS`

In [None]:
# assign() returns a new frame, so the column is never written into what may
# be a view of df (the in-place .loc assignment triggers
# SettingWithCopyWarning on a filtered slice).
strictly_named = strictly_named.assign(is_kwargs=strictly_named['ArgSpec'] == '**KWARGS')

strictly_named_grouped = strictly_named.groupby(["FunctionName", "Keyword"])

In [None]:
# Fraction of named arguments whose ArgSpec is **KWARGS.
strictly_named.is_kwargs.mean()

In [None]:
# ArgType distribution restricted to the named arguments.
named_arg_types = strictly_named['ArgType'].value_counts()
build_type_pie_charts(named_arg_types, title="Distribution of ArgType for Named Arguments")


In [None]:
# Fraction of named arguments whose value is a literal.
strictly_named['is_literal'].mean()

We now focus only on the named args captured by `**KWARGS`

In [None]:
# Named arguments whose ArgSpec is the **KWARGS catch-all.
captured_by_kwargs = strictly_named['ArgSpec'] == '**KWARGS'
kwargs_captured = strictly_named[captured_by_kwargs]

In [None]:
# One group of **KWARGS-captured arguments per function.
kwargs_grouped = kwargs_captured.groupby(by=["FunctionName"])

In [None]:
def _distinct_keyword_count(group):
    """Number of distinct Keyword values in the group."""
    return len(group['Keyword'].unique())

# Histogram (log scale) of the number of distinct keywords per function.
keywords_per_function = kwargs_grouped.apply(_distinct_keyword_count)
keywords_per_function.plot(kind='hist', bins=50, log=True)

Now that we have all the args captured by `**KWARGS`, we can check how many different keywords are used for these (per function)

In [None]:
def _keyword_stats(group):
    """Distinct keyword count and total row count of one function's group."""
    return pd.Series({"keyword_count": len(group['Keyword'].unique()),
                      "total_count": len(group)})

call_per_keyword = kwargs_grouped.apply(_keyword_stats).reset_index()

In [None]:
# Average number of rows per distinct keyword, shown largest first.
call_per_keyword['avg'] = call_per_keyword['total_count'].div(call_per_keyword['keyword_count'])
call_per_keyword.sort_values(ascending=False, by="avg")

### Keyword usage over all named arguments
We do the same analysis but over all the named arguments

In [None]:
# Histogram (log scale) of how many times each keyword is used.
keyword_usage = strictly_named['Keyword'].value_counts()
keyword_usage.plot(kind='hist', bins=50, log=True, title='Distribution of keyword usage (logscale)')

In [None]:
# Number of uses of each keyword, most used first.
strictly_named.Keyword.value_counts()

In [None]:
# For every keyword, the number of distinct functions it is used with,
# plotted as a histogram (log scale).
functions_per_keyword = strictly_named.groupby("Keyword").apply(
    lambda group: len(group['FunctionName'].unique()))
functions_per_keyword.sort_values(ascending=False).plot(
    kind='hist', log=True, bins=50,
    title="Dist of # of function per keyword (logscale)")

In [None]:
# Same per-keyword function counts as a table, largest first.
keyword_function_counts = strictly_named.groupby("Keyword").apply(
    lambda group: len(group['FunctionName'].unique()))
keyword_function_counts.sort_values(ascending=False)

Some metrics around keyword usage: how frequently a keyword is used (over all existing functions) and in how many functions.
The part about literals is here as a bonus (used for the color in the next chart).

In [None]:
def agg_func(group):
    """Summarise one group: row count, distinct functions, literal ratio.

    Returns a Series with the entries ``call_count`` (rows in the group),
    ``function_count`` (distinct ``FunctionName`` values) and
    ``literal_ratio`` (mean of the ``is_literal`` column).
    """
    stats = {
        "call_count": len(group),
        "function_count": len(group["FunctionName"].unique()),
        "literal_ratio": group['is_literal'].mean(),
    }
    return pd.Series(stats)


# One row of usage metrics per keyword.
per_keyword = strictly_named.groupby(by="Keyword")
count_and_function = per_keyword.apply(agg_func).reset_index()

In [None]:
# Scatter of call count against function count per keyword, coloured by
# literal ratio.
count_and_function.plot(
    kind='scatter',
    x='function_count',
    y='call_count',
    c='literal_ratio',
    colormap='brg',
    sharex=False,
    title="#Call/#Function per keyword",
)

In [None]:
# Number of distinct Keyword values over the whole frame.
len(df.Keyword.unique())

In [None]:
# List the columns currently present in df (including the ones added above).
df.columns

In [None]:
# One group per (FunctionName, CallID) pair.
function_calls = df.groupby(by=["FunctionName", "CallID"])

In [None]:
def has_named_arg(function_call):
    """Count the named and non-NameExpr arguments among the given rows.

    Args:
        function_call: DataFrame with the ``is_named`` and ``ArgType``
            columns.

    Returns:
        A Series with ``named_arg_count``, ``arg_count`` (number of rows),
        ``non_name_exp_named_arg`` / ``non_name_exp_non_named_arg`` (rows
        whose ArgType is not ``*pythonast.NameExpr``, split by named / not
        named) and ``non_name_exp`` (all rows whose ArgType is not
        ``*pythonast.NameExpr``).
    """
    named = function_call["is_named"]
    not_name_expr = function_call['ArgType'] != '*pythonast.NameExpr'

    counts = {
        "named_arg_count": named.sum(),
        "arg_count": len(function_call),
        "non_name_exp_named_arg": (named & not_name_expr).sum(),
        "non_name_exp_non_named_arg": (~named & not_name_expr).sum(),
        "non_name_exp": not_name_expr.sum(),
    }
    return pd.Series(counts)
                     
# One row of argument counts per (FunctionName, CallID) group.
calls_summary = function_calls.apply(func=has_named_arg)

In [None]:
# Move the group keys out of the index and into regular columns.
calls_sum_df = calls_summary.reset_index(drop=False)

In [None]:
# Random sample of 25 summary rows.
calls_sum_df.sample(n=25)

In [None]:
# Histogram (log scale) of the number of named arguments per call.
named_counts = calls_sum_df['named_arg_count']
named_counts.plot(kind='hist', title="Distribution of named arg count (logscale)", bins=20, log=True)

In [None]:
# True for calls that have at least one named argument.
calls_sum_df['at_least_one_named'] = calls_sum_df['named_arg_count'].gt(0)

In [None]:
# Pie chart of calls with / without at least one named argument.
has_named_counts = calls_sum_df['at_least_one_named'].value_counts()
has_named_counts.plot(kind='pie')

In [None]:
# Calls with at least one non-named argument whose ArgType is not a NameExpr:
# print their fraction, then plot the split.
flag_column = 'at_least_one_non-name_for_non_named'
calls_sum_df[flag_column] = calls_sum_df['non_name_exp_non_named_arg'] > 0
print(calls_sum_df[flag_column].mean())
calls_sum_df[flag_column].value_counts().sort_index().plot(kind='pie')

In [None]:
# Calls with at least one named argument whose ArgType is not a NameExpr:
# print their fraction, then plot the split.
flag_column = 'at_least_one_non_name_exp_named_arg'
calls_sum_df[flag_column] = calls_sum_df['non_name_exp_named_arg'] > 0
print(calls_sum_df[flag_column].mean())
calls_sum_df[flag_column].value_counts().sort_index().plot(kind='pie')

In [None]:
# Calls with at least one argument whose ArgType is not a NameExpr:
# print their fraction, then plot the split.
flag_column = 'at_least_one_non_name_exp'
calls_sum_df[flag_column] = calls_sum_df['non_name_exp'] > 0
print(calls_sum_df[flag_column].mean())
calls_sum_df[flag_column].value_counts().sort_index().plot(kind='pie')

In [None]:
# Fraction of calls that have at least one named argument.
calls_sum_df.at_least_one_named.mean()

In [None]:
# Keep the calls that have at least one argument. The explicit copy makes
# filtered_csd an independent frame, so adding columns to it afterwards does
# not write into a possible view of calls_sum_df (SettingWithCopyWarning).
filtered_csd = calls_sum_df[calls_sum_df['arg_count'] > 0].copy()

In [None]:
# assign() returns a new frame, so the ratio column is never written into a
# filtered slice of calls_sum_df (which triggers SettingWithCopyWarning).
filtered_csd = filtered_csd.assign(
    named_arg_ratio=filtered_csd['named_arg_count'] / filtered_csd['arg_count'])

In [None]:
# Histogram (log scale) of the per-call share of named arguments.
per_call_ratio = filtered_csd['named_arg_ratio']
per_call_ratio.plot(kind='hist', title="Named arg ratio (logscale)", bins=50, log=True)

In [None]:
# Share of calls for each number of arguments, as a pie chart.
arg_count_share = calls_sum_df['arg_count'].value_counts() / len(calls_sum_df)
arg_count_share.plot(kind='pie', title="Arg count distribution")

In [None]:
# Same shares as a table.
arg_count_share = calls_sum_df['arg_count'].value_counts() / len(calls_sum_df)
arg_count_share.reset_index()

In [None]:
# Total number of (FunctionName, CallID) groups summarised.
calls_sum_df.shape[0]