In [270]:
%matplotlib notebook
import pandas as pd
import seaborn as sns
import warnings
import matplotlib as plot
import matplotlib.pyplot as plt
from eda_3 import *

warnings.simplefilter('ignore')
pd.options.display.max_columns = 300
sns.set_style('darkgrid')

In [271]:
raw_data = get_raw_data() #need to change this to a sql query so it will work from anywhere

In [272]:
raw_data['psc_description'] = raw_data['product_or_service_code_description'].copy() #renaming so it doesn't get deleted for having 'code' in header

In [273]:
def initial_scrub(input_df):
    '''description'''
    raw_data_df = input_df.copy()
    list_of_code_headers = list(raw_data_df.filter(regex='code'))
    no_code_df = raw_data_df[raw_data_df.columns.drop(list_of_code_headers)]
    list_of_recipient_headers = list(no_code_df.filter(regex='recipient'))
    no_recipient_df = no_code_df[no_code_df.columns.drop(list_of_recipient_headers)]
    list_of_business_headers = list(no_recipient_df.filter(regex='business'))
    no_business_df = no_recipient_df[no_recipient_df.columns.drop(list_of_business_headers)]
    #need to drop other columns assciated with award winner, since these aren't known when award is announced
    return no_business_df, list_of_code_headers, list_of_recipient_headers, list_of_business_headers

In [274]:
def data_summary(input_df):
    '''this function produces a summary of a raw data set. the summary contains three facts about each column of the
    raw dataset: 1. the non-null count 2. the unique values contained in the column 3. the data type contained in the
    column; this data type is determined by pandas, I think, and might not be exactly correct all of the time'''
    working_df = input_df.copy()
    column_null_counts = working_df.isnull().sum()
    column_non_null_counts = len(working_df)-column_null_counts
    unique_entries_per_column = working_df.nunique()
    data_type = working_df.dtypes
    summary_df = pd.concat([column_non_null_counts,unique_entries_per_column,data_type],axis=1)
    summary_df.columns =  ['non_null_counts','unique_value_counts','data_type']
    return summary_df

In [275]:
smaller_raw_df, dropped_code_headers, dropped_recipient_headers, dropped_business_headers = initial_scrub(raw_data)

In [277]:
len(dropped_code_headers)

63

In [278]:
len(dropped_recipient_headers)

13

In [279]:
len(dropped_business_headers)

22

Include verification that headers containing "code" are redundant

In [280]:
smaller_raw_summary = data_summary(smaller_raw_df)

In [281]:
smaller_raw_summary.head()

Unnamed: 0,non_null_counts,unique_value_counts,data_type
contract_transaction_unique_key,1000000,1000000,object
award_id_piid,1000000,376482,object
modification_number,1000000,2330,object
transaction_number,1000000,19,int64
parent_award_agency_id,780144,30,object


In [217]:
smaller_raw_summary.data_type.value_counts()

object     154
float64     22
int64        3
Name: data_type, dtype: int64

In [218]:
object_column_df = smaller_raw_summary[smaller_raw_summary.data_type=='object']
float64_column_df = smaller_raw_summary[smaller_raw_summary.data_type=='float64']
int64_column_df = smaller_raw_summary[smaller_raw_summary.data_type=='int64']

In [219]:
int64_column_df

Unnamed: 0,non_null_counts,unique_value_counts,data_type
transaction_number,1000000,19,int64
action_date_fiscal_year,1000000,1,int64
number_of_actions,1000000,238,int64


In [220]:
float64_column_df

Unnamed: 0,non_null_counts,unique_value_counts,data_type
federal_action_obligation,1000000,413361,float64
total_dollars_obligated,90525,64257,float64
base_and_exercised_options_value,1000000,405430,float64
current_total_value_of_award,185370,108519,float64
base_and_all_options_value,1000000,400488,float64
potential_total_value_of_award,185370,108561,float64
ordering_period_end_date,0,0,float64
sam_exception,1005,5,float64
primary_place_of_performance_congressional_district,952081,54,float64
idv_type,0,0,float64


Many of the columns that contain data type float64 (above table) appear to be monetary

In [221]:
float64_column_df_null_filtered = float64_column_df[float64_column_df.non_null_counts>0.2*raw_data.shape[0]]

In [222]:
float64_column_df_null_filtered

Unnamed: 0,non_null_counts,unique_value_counts,data_type
federal_action_obligation,1000000,413361,float64
base_and_exercised_options_value,1000000,405430,float64
base_and_all_options_value,1000000,400488,float64
primary_place_of_performance_congressional_district,952081,54,float64
number_of_offers_received,978093,179,float64
number_of_employees,999682,3232,float64
annual_revenue,999682,18759,float64


In [223]:
object_column_df

Unnamed: 0,non_null_counts,unique_value_counts,data_type
contract_transaction_unique_key,1000000,1000000,object
award_id_piid,1000000,376482,object
modification_number,1000000,2330,object
parent_award_agency_id,780144,30,object
parent_award_agency_name,175468,25,object
parent_award_id,780144,34678,object
parent_award_modification_number,803075,614,object
action_date,1000000,365,object
period_of_performance_start_date,1000000,1399,object
period_of_performance_current_end_date,1000000,5059,object


In [224]:
object_column_df_null_filtered = object_column_df[object_column_df.non_null_counts>0.2*raw_data.shape[0]]

In [225]:
object_column_df.shape

(154, 3)

In [226]:
object_column_df_null_filtered.shape

(130, 3)

In [227]:
def get_filtered_summary_list(summary_df,max_null,min_unique,max_unique):
    null_edited_df = summary_df[summary_df.non_null_counts>=max_null].copy()
    unique_edited_df = null_edited_df[(null_edited_df.unique_value_counts>min_unique)&(null_edited_df.unique_value_counts<max_unique)].copy()
    #data_type_edited_df = unique_edited_df[unique_edited_df.data_type.isin(data_type)].copy()
    #data_type_edited_df = unique_edited_df[unique_edited_df.data_type==data_type].copy()
    column_list = unique_edited_df.index.tolist()
    print('the list is length '+str(len(column_list)))
    return unique_edited_df, column_list

In [228]:
edited_summary, test_list = get_filtered_summary_list(smaller_raw_summary,800000,3,50)

the list is length 23


In [229]:
edited_summary

Unnamed: 0,non_null_counts,unique_value_counts,data_type
transaction_number,1000000,19,int64
awarding_sub_agency_name,1000000,24,object
funding_agency_name,1000000,16,object
sam_exception_description,918202,10,object
award_type,1000000,8,object
type_of_contract_pricing,1000000,16,object
contract_bundling,999478,4,object
dod_claimant_program_description,999994,27,object
recovered_materials_sustainability,998416,15,object
domestic_or_foreign_entity,916150,5,object


In [247]:
def column_count_cdf(data_df, column):
    column_summary = pd.DataFrame(columns=['count','percent','percent_running_sum'])
    column_summary['count'] = data_df[column].value_counts()
    nonnulls = data_df[column].value_counts().sum()
    column_summary['percent'] = data_df[column].value_counts()/nonnulls*100
    column_summary['percent_running_sum'] = column_summary['percent'].cumsum()
    column_summary = column_summary.reset_index()
    column_summary = column_summary.drop('index',axis=1)
    return column_summary

def plot_cdf(data_df, columnlist):
    max_x = 0
    for column in columnlist:
        print(column)
        colsummary=column_count_cdf(data_df, column)
        colsummary['percent_running_sum'].plot(linewidth=3)
        if max_x<=max(colsummary['count']):
            max_x = max(colsummary['count'])
        print(max_x)
    plt.legend(columnlist, fontsize=12)
    plt.ylim([-5,105])
    plt.xlim(xmin=-5)
    plt.title('Feature Distribution CDF',fontsize=20)
    plt.xlabel('Feature\'s Unique Value',fontsize=15)
    plt.ylabel('% of Non-nul Values',fontsize=15)

In [248]:
plot_cdf(smaller_raw_df, ['psc_description','naics_description'])

psc_description


<IPython.core.display.Javascript object>

132578
naics_description
181108


In [258]:
column_summary = column_count_cdf(smaller_raw_df,'naics_description')

In [264]:
column_summary

Unnamed: 0,count,percent,percent_running_sum
0,181108,18.153843,18.153843
1,70068,7.023453,25.177295
2,38592,3.868372,29.045667
3,37681,3.777055,32.822723
4,36436,3.652260,36.474982
5,34515,3.459703,39.934685
6,30750,3.082308,43.016993
7,24837,2.489603,45.506596
8,22141,2.219362,47.725958
9,21305,2.135563,49.861522


In [263]:
def get_bucketed_list(column_cdf_df, percent_buckets=20):
    bucketed_list = pd.DataFrame([])
    for bucket in range(1,int(100/percent_bucket_size)+1):
        bucketed_list[i] = column_cdf_df[columns_cdf_df]
    

In [269]:
for i in range(1,int(100/20)+1):
    print(i)

1
2
3
4
5


In [232]:
from yellowbrick.features import Rank2D

In [233]:
test_df = smaller_raw_df[['type_of_set_aside','naics_description','psc_description']][0:100000].copy()

In [234]:
test_df.shape

(100000, 3)

In [235]:
smaller_test_df = test_df[test_df.naics_description.isin(['COMMERCIAL BAKERIES','ALL OTHER MISCELLANEOUS MANUFACTURING','PETROLEUM REFINERIES'])].copy()

In [236]:
smaller_test_df.shape

(11325, 3)

In [237]:
smaller_test_df.isnull().sum()

type_of_set_aside    0
naics_description    0
psc_description      0
dtype: int64

In [238]:
def get_Xy(input_df, target = 'type_of_set_aside'):
    X = input_df.drop(target,axis=1)
    y = input_df[target]
    return X, y

In [239]:
models = [
    SVC(gamma='auto'), LinearSVC(),
    SGDClassifier(max_iter=100, tol=1e-3), KNeighborsClassifier(),
    LogisticRegression(solver='lbfgs'), LogisticRegressionCV(cv=3),
    BaggingClassifier(), ExtraTreesClassifier(n_estimators=300),
    RandomForestClassifier(n_estimators=300)
] #NuSVC(gamma='auto') didn't work because of some nu issue?

In [240]:
def score_model(X, y, estimator, **kwargs):
    """
    Test various estimators.
    """
    y = LabelEncoder().fit_transform(y)
    model = Pipeline([
        ('one_hot_encoder', OneHotEncoder()),
        ('estimator', estimator)
    ])

    # Instantiate the classification model and visualizer
    model.fit(X, y, **kwargs)

    expected  = y
    predicted = model.predict(X)

    # Compute and return F1 (harmonic mean of precision and recall)
    print("{}: {}".format(estimator.__class__.__name__, f1_score(expected, predicted, average='weighted')))


In [241]:
X, y = get_Xy(smaller_test_df)

In [242]:
for model in models:
    score_model(X,y,model)

SVC: 0.9784255221094653
LinearSVC: 0.9793612231584454
SGDClassifier: 0.9786344588841708
KNeighborsClassifier: 0.9784255221094653
LogisticRegression: 0.9784255221094653
LogisticRegressionCV: 0.9784255221094653
BaggingClassifier: 0.9790506062056439
ExtraTreesClassifier: 0.9795675978644472
RandomForestClassifier: 0.9795675978644472


In [243]:
from sklearn.model_selection import train_test_split

In [244]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33, random_state=42)

In [245]:
object_column_binary_df = object_column_df[object_column_df.unique_entries_per_column==2].copy()

AttributeError: 'DataFrame' object has no attribute 'unique_entries_per_column'

In [None]:
object_column_binary_df

In [None]:
for i in object_column_binary_df.index:
    print(raw_data[i].value_counts())