In [1]:
#import Libraries
import os

import numpy as np
import pandas as pd
import psycopg2
# Let wide frames show up to 300 columns before pandas truncates the display.
pd.set_option('display.max_columns', 300)

In [2]:
#connect SQL
# Connection settings are taken from the environment so that no credential is
# stored in the notebook. The non-secret settings fall back to the values this
# notebook was written against; the password deliberately has no fallback, so a
# missing PGPASSWORD raises KeyError here instead of connecting with a baked-in
# secret.
conn = psycopg2.connect(
    database=os.environ.get('PGDATABASE', 'usaspendingdb'),
    user=os.environ.get('PGUSER', 'postgres'),
    password=os.environ['PGPASSWORD'],
    host=os.environ.get('PGHOST', '127.0.0.1'),
    port=os.environ.get('PGPORT', '5432'),
)

In [3]:
# Column names to select, one per entry; joined with ', ' so the resulting
# string can be placed directly after SELECT.
sql_cols = ', '.join([
    'federal_action_obligation',
    'base_and_exercised_options_value',
    'base_and_all_options_value',
    'awarding_sub_agency_name',
    'awarding_office_name',
    'funding_sub_agency_name',
    'funding_office_name',
    'primary_place_of_performance_state_code',
    'award_or_idv_flag',
    'award_type',
    'type_of_contract_pricing',
    'dod_claimant_program_description',
    'type_of_set_aside_code',
    'contract_bundling',
    'national_interest_action',
    'gfe_gfp',
    'contract_financing',
    'portfolio_group',
    'product_or_service_code_description',
    'naics_bucket_title',
    'naics_description',
])

In [4]:
# Build the SELECT statement from the column list, run it over the open
# connection, and report the size of the frame that comes back.
query = 'SELECT {} FROM consolidated_data_filtered_bucketed'.format(sql_cols)
df = pd.read_sql_query(query, con=conn)
df.shape

(35414, 21)

In [5]:
# Count the missing values in each column of df.
df.isna().sum()

federal_action_obligation                     0
base_and_exercised_options_value              0
base_and_all_options_value                    0
awarding_sub_agency_name                      0
awarding_office_name                          0
funding_sub_agency_name                       0
funding_office_name                          43
primary_place_of_performance_state_code       0
award_or_idv_flag                             0
award_type                                    0
type_of_contract_pricing                      0
dod_claimant_program_description              1
type_of_set_aside_code                      414
contract_bundling                             0
national_interest_action                      0
gfe_gfp                                       0
contract_financing                            0
portfolio_group                               0
product_or_service_code_description           0
naics_bucket_title                         7870
naics_description                       

In [6]:
# Keep only the rows that have a value in 'type_of_set_aside_code'.
df = df.dropna(subset=['type_of_set_aside_code'])
df.shape

(35000, 21)

In [7]:
# Tally how many rows carry each distinct set-aside code.
df['type_of_set_aside_code'].value_counts()

NONE       22154
SBA         8619
WOSB        1379
8A          1222
8AN         1185
SDVOSBC      271
HZC          108
SBP           30
SDVOSBS       14
EDWOSB         7
WOSBSS         6
HZS            4
ISBEE          1
Name: type_of_set_aside_code, dtype: int64

In [8]:
# Encode every set-aside code as its 1-based position in the list below
# (NONE -> 1, SBA -> 2, ..., ISBEE -> 13).
df['set_aside_number'] = df['type_of_set_aside_code'].map({
    code: number
    for number, code in enumerate(
        ['NONE', 'SBA', 'WOSB', '8A', '8AN', 'SDVOSBC', 'HZC',
         'SBP', 'SDVOSBS', 'EDWOSB', 'WOSBSS', 'HZS', 'ISBEE'],
        start=1,
    )
})

In [9]:
# Tally the rows per numeric set-aside encoding.
df['set_aside_number'].value_counts()

1     22154
2      8619
3      1379
4      1222
5      1185
6       271
7       108
8        30
9        14
10        7
11        6
12        4
13        1
Name: set_aside_number, dtype: int64

In [10]:
def contract_value(c):
    """Pick a single value figure for one contract row.

    The three value fields are consulted in a fixed order of preference and
    the first one that is strictly positive is returned; later fields are not
    read once an earlier one qualifies.

    Parameters
    ----------
    c : mapping-like (for example a DataFrame row)
        Must support ``c[name]`` lookup for the field names listed below.

    Returns
    -------
    The first strictly positive field value, or ``0`` if none is positive.
    """
    preferred_fields = (
        'base_and_exercised_options_value',
        'base_and_all_options_value',
        'federal_action_obligation',
    )
    for field in preferred_fields:
        amount = c[field]
        if amount > 0:
            return amount
    return 0

In [11]:
# Call contract_value once per row (axis=1) and store the results as a new column.
df['contract_value'] = df.apply(contract_value, axis=1)

In [12]:
# Remove the raw set-aside code and the three raw value columns from df.
df = df.drop(columns=[
    'type_of_set_aside_code',
    'base_and_exercised_options_value',
    'base_and_all_options_value',
    'federal_action_obligation',
])

In [13]:
# Build a second frame that keeps only the rows with no missing value in any column.
df2 = df.dropna(axis='index', how='any')
df2.shape

(27246, 19)

In [14]:
# Preview the first rows of df2.
df2.head()

Unnamed: 0,awarding_sub_agency_name,awarding_office_name,funding_sub_agency_name,funding_office_name,primary_place_of_performance_state_code,award_or_idv_flag,award_type,type_of_contract_pricing,dod_claimant_program_description,contract_bundling,national_interest_action,gfe_gfp,contract_financing,portfolio_group,product_or_service_code_description,naics_bucket_title,naics_description,set_aside_number,contract_value
0,USTRANSCOM,USTRANSCOM-AQ,DEPT OF THE AIR FORCE,HQ AMC TE,TX,AWARD,DO,FIRM FIXED PRICE,SERVICES,NOT A BUNDLED REQUIREMENT,NONE,TRANSACTION DOES NOT USE GFE/GFP,NOT APPLICABLE,Transportation Services,TRANSPORTATION/TRAVEL/RELOCATION- TRAVEL/LODGI...,Air TransportationT,NONSCHEDULED CHARTERED PASSENGER AIR TRANSPORT...,1,64925.82
1,WASHINGTON HEADQUARTERS SERVICES (WHS),WASHINGTON HEADQUARTERS SERVICES,WASHINGTON HEADQUARTERS SERVICES (WHS),PENTAGON FORCE PROTECTION AGENCY,VA,AWARD,DO,FIRM FIXED PRICE,SERVICES,NOT A BUNDLED REQUIREMENT,NONE,TRANSACTION DOES NOT USE GFE/GFP,NOT APPLICABLE,Electronic & Communication Services,"INSTALLATION OF EQUIPMENT- ALARM, SIGNAL, AND ...",Administrative and Support and Waste Managemen...,SECURITY SYSTEMS SERVICES (EXCEPT LOCKSMITHS),1,4897.96
3,DEFENSE LOGISTICS AGENCY,DLA DISPOSTION SERVICE - EBS,DEFENSE LOGISTICS AGENCY,DLA DISPOSTION SERVICE - EBS,WI,AWARD,DO,FIRM FIXED PRICE,SERVICES,NOT A BUNDLED REQUIREMENT,NONE,TRANSACTION DOES NOT USE GFE/GFP,NOT APPLICABLE,Facility Related Services,HOUSEKEEPING- WASTE TREATMENT/STORAGE,Administrative and Support and Waste Managemen...,HAZARDOUS WASTE TREATMENT AND DISPOSAL,4,5618.14
4,DEFENSE INFORMATION SYSTEMS AGENCY (DISA),DITCO-SCOTT,DEPT OF THE ARMY,W4NH NETCOM HHC,TX,AWARD,BPA CALL,FIRM FIXED PRICE,SERVICES,NOT A BUNDLED REQUIREMENT,NONE,TRANSACTION DOES NOT USE GFE/GFP,NOT APPLICABLE,Electronic & Communication Services,IT AND TELECOM- TELECOMMUNICATIONS AND TRANSMI...,InformationT,SATELLITE TELECOMMUNICATIONS,1,163.92
7,USTRANSCOM,USTRANSCOM-AQ,DEPT OF THE AIR FORCE,HQ AMC TE,KS,AWARD,DO,FIRM FIXED PRICE,SERVICES,NOT A BUNDLED REQUIREMENT,NONE,TRANSACTION DOES NOT USE GFE/GFP,NOT APPLICABLE,Transportation Services,TRANSPORTATION/TRAVEL/RELOCATION- TRAVEL/LODGI...,Air TransportationT,NONSCHEDULED CHARTERED PASSENGER AIR TRANSPORT...,1,220000.0


In [15]:
# Summary statistics for df2 (default describe() settings).
df2.describe()

Unnamed: 0,set_aside_number,contract_value
count,27246.0,27246.0
mean,1.803274,459681.9
std,1.217351,6394049.0
min,1.0,0.01
25%,1.0,2400.0
50%,1.0,9800.0
75%,2.0,76468.18
max,12.0,699368600.0


In [16]:
# Convert df2 to indicator (dummy) columns using get_dummies' default settings.
df3 = pd.get_dummies(df2)

In [17]:
# Feature matrix: every column of df3 except the target 'set_aside_number'.
X = df3.drop(columns=['set_aside_number'])

In [18]:
# Target vector: the numeric set-aside encoding.
y = df3['set_aside_number']

In [19]:
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.metrics import confusion_matrix , classification_report
from sklearn.svm import SVC

In [20]:
# Hold out 20% of the rows for testing; random_state=42 makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.20, random_state=42)

In [21]:
# gamma is pinned explicitly instead of relying on the library default: the
# fitted model's repr in this notebook reports gamma='auto_deprecated', i.e. the
# implicit default is being changed upstream. 'auto' keeps the behaviour this
# notebook was run with and gives the same model on any scikit-learn version.
# NOTE(review): the features reach the RBF kernel unscaled (contract_value
# ranges from 0.01 to about 7e8 in the describe() output) -- consider
# standardising them; TODO confirm the effect on accuracy.
model = SVC(kernel='rbf', gamma='auto')

In [23]:
# Fit the classifier on the training partition.
model.fit(X_train, y_train)



SVC(C=1.0, cache_size=200, class_weight=None, coef0=0.0,
    decision_function_shape='ovr', degree=3, gamma='auto_deprecated',
    kernel='rbf', max_iter=-1, probability=False, random_state=None,
    shrinking=True, tol=0.001, verbose=False)

In [24]:
# Predict the set-aside class for every held-out row.
predictions = model.predict(X_test)

In [25]:
# Print the confusion matrix first, then the per-class precision/recall report,
# both computed from the held-out labels and the model's predictions.
for build_report in (confusion_matrix, classification_report):
    print(build_report(y_test, predictions))

[[2304  594    4    4    8    3    0    0    0    0    0]
 [ 835  811    8    5    0    2    0    0    0    0    0]
 [ 119  158   13    0    0    0    0    0    0    0    0]
 [ 135  100    0   11    0    0    0    0    0    0    0]
 [ 164   78    2    1    5    0    0    0    0    0    0]
 [  37   13    1    0    0    1    0    0    0    0    0]
 [  19    6    0    0    0    0    0    0    0    0    0]
 [   4    1    0    0    0    0    0    0    0    0    0]
 [   1    0    0    0    0    0    0    0    0    0    0]
 [   2    0    0    0    0    0    0    0    0    0    0]
 [   1    0    0    0    0    0    0    0    0    0    0]]
              precision    recall  f1-score   support

           1       0.64      0.79      0.70      2917
           2       0.46      0.49      0.47      1661
           3       0.46      0.04      0.08       290
           4       0.52      0.04      0.08       246
           5       0.38      0.02      0.04       250
           6       0.17      0.02   

  'precision', 'predicted', average, warn_for)


In [28]:
# 12-fold cross-validation of the model over the complete X / y.
score = cross_val_score(model, X, y, cv=12)



In [29]:
# Average the per-fold scores and report the result.
mean_accuracy = score.mean()
print('Accuracy : ', mean_accuracy)

Accuracy :  0.5882525451078138
