In [1]:
#import Libraries
import getpass
import os

import psycopg2
import numpy as np
import pandas as pd
# Let pandas display up to 300 columns before truncating wide frames.
pd.set_option('display.max_columns', 300)

In [2]:
# Connect to the PostgreSQL database that holds the spending data.
# Connection settings are taken from the standard libpq environment variables,
# falling back to the original local defaults. The password is never stored in
# the notebook: it is read from PGPASSWORD or, if that is unset, prompted for.
conn = psycopg2.connect(
    database=os.environ.get('PGDATABASE', 'usaspendingdb'),
    user=os.environ.get('PGUSER', 'postgres'),
    password=os.environ.get('PGPASSWORD') or getpass.getpass('PostgreSQL password: '),
    host=os.environ.get('PGHOST', '127.0.0.1'),
    port=os.environ.get('PGPORT', '5432'),
)

In [3]:
# Comma-separated list of the columns to select, built from one name per entry.
sql_cols = ', '.join((
    'federal_action_obligation',
    'base_and_exercised_options_value',
    'base_and_all_options_value',
    'awarding_sub_agency_name',
    'awarding_office_name',
    'funding_sub_agency_name',
    'funding_office_name',
    'primary_place_of_performance_state_code',
    'award_or_idv_flag',
    'award_type',
    'type_of_contract_pricing',
    'dod_claimant_program_description',
    'type_of_set_aside_code',
    'contract_bundling',
    'national_interest_action',
    'gfe_gfp',
    'contract_financing',
    'portfolio_group',
    'product_or_service_code_description',
    'naics_bucket_title',
    'naics_description',
))

In [4]:
# Load the selected columns of the table into a DataFrame and show its shape.
df = pd.read_sql_query(
    'SELECT {} FROM consolidated_data_filtered_bucketed'.format(sql_cols),
    con=conn,
)
df.shape

(35414, 21)

In [5]:
# Count the missing values in each column of the DataFrame.
df.isna().sum()

federal_action_obligation                     0
base_and_exercised_options_value              0
base_and_all_options_value                    0
awarding_sub_agency_name                      0
awarding_office_name                          0
funding_sub_agency_name                       0
funding_office_name                          43
primary_place_of_performance_state_code       0
award_or_idv_flag                             0
award_type                                    0
type_of_contract_pricing                      0
dod_claimant_program_description              1
type_of_set_aside_code                      414
contract_bundling                             0
national_interest_action                      0
gfe_gfp                                       0
contract_financing                            0
portfolio_group                               0
product_or_service_code_description           0
naics_bucket_title                         7870
naics_description                       

In [6]:
# Keep only the rows that have a value in 'type_of_set_aside_code'.
# The explicit .copy() makes the filtered frame independent of the frame it was
# sliced from, so the column assignments made on `df` in the following cells do
# not trigger pandas' SettingWithCopyWarning.
df = df[df['type_of_set_aside_code'].notna()].copy()
df.shape

(35000, 21)

In [7]:
def set_aside(c):
    """Return 0 when the row's 'type_of_set_aside_code' is 'NONE', otherwise 1.

    `c` is any mapping-like row (e.g. a DataFrame row) that can be indexed by
    the 'type_of_set_aside_code' key.
    """
    code = c['type_of_set_aside_code']
    return 0 if code == 'NONE' else 1

In [8]:
# Add a 'set_aside' column by running set_aside() on every row (0 or 1 per row).
df['set_aside'] = df.apply(set_aside, axis='columns')

In [9]:
def contract_value(c):
    """Return the first strictly positive amount found in the row, else 0.

    The fields are checked in this order:
    'base_and_exercised_options_value', 'base_and_all_options_value',
    'federal_action_obligation'. Later fields are only read when every
    earlier one is not greater than zero.
    """
    priority = (
        'base_and_exercised_options_value',
        'base_and_all_options_value',
        'federal_action_obligation',
    )
    for field in priority:
        amount = c[field]
        if amount > 0:
            return amount
    # No field holds a positive amount.
    return 0

In [10]:
# Add a 'contract_value' column by running contract_value() on every row.
df['contract_value'] = df.apply(contract_value, axis='columns')

In [11]:
# Remove the raw columns that the derived 'set_aside' and 'contract_value'
# columns were computed from; they are not needed anymore.
df = df.drop(columns=[
    'type_of_set_aside_code',
    'base_and_exercised_options_value',
    'base_and_all_options_value',
    'federal_action_obligation',
])

In [12]:
# Build a second DataFrame that keeps only the rows with no missing value
# in any column, then show its shape.
df2 = df.dropna(axis=0, how='any')
df2.shape

(27246, 19)

In [13]:
# Build a third DataFrame by passing df2 through pandas' get_dummies
# (indicator-column encoding) with its default settings.
df3 = pd.get_dummies(data=df2)

In [15]:
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.metrics import confusion_matrix , classification_report
from sklearn.svm import SVC

In [17]:
# Feature matrix: every column of df3 except the 'set_aside' target.
X = df3.drop(columns=['set_aside'])

In [21]:
# Target vector: the 0/1 'set_aside' column of df3.
y = df3.loc[:, 'set_aside']

In [22]:
# Hold out 20% of the rows for testing; the fixed random_state makes the
# split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

In [23]:
# Support-vector classifier with an RBF kernel.
# gamma is pinned explicitly: the recorded fit output shows
# gamma='auto_deprecated', i.e. the run relied on the old default, which
# newer scikit-learn releases replaced with 'scale'. Setting gamma='auto'
# keeps the kernel width used for the recorded results across versions.
model = SVC(kernel='rbf', gamma='auto')

In [24]:
# Train the classifier on the training split.
model.fit(X=X_train, y=y_train)



SVC(C=1.0, cache_size=200, class_weight=None, coef0=0.0,
    decision_function_shape='ovr', degree=3, gamma='auto_deprecated',
    kernel='rbf', max_iter=-1, probability=False, random_state=None,
    shrinking=True, tol=0.001, verbose=False)

In [25]:
# Predict the set-aside label for every row of the held-out test split.
predictions = model.predict(X=X_test)

In [26]:
# Compare the predictions with the true test labels: first the confusion
# matrix, then the per-class precision/recall/F1 report.
print(confusion_matrix(y_true=y_test, y_pred=predictions))
print(classification_report(y_true=y_test, y_pred=predictions))

[[1963  954]
 [ 818 1715]]
              precision    recall  f1-score   support

           0       0.71      0.67      0.69      2917
           1       0.64      0.68      0.66      2533

    accuracy                           0.67      5450
   macro avg       0.67      0.68      0.67      5450
weighted avg       0.68      0.67      0.68      5450



In [27]:
# Score the model with 12-fold cross-validation over the full X and y.
score = cross_val_score(model, X, y, cv=12)



In [28]:
# Report the mean of the cross-validation scores.
print('Accuracy : ', np.mean(score))

Accuracy :  0.6830321358167915
