#### Import libraries

In [1]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import PolynomialFeatures

#### Reading the data

In [2]:
#reading the two cleaned CSV files into DataFrames
train_path, test_path = '../data/large_clean.csv', '../data/test_clean.csv'
df = pd.read_csv(train_path)
test = pd.read_csv(test_path)

#### Feature Engineering

In [3]:
#one-hot encoding the categorical columns of both frames (drop_first=True omits the first level of each)
#NOTE(review): the two frames are encoded independently, so a category present in only one file
#would give them different dummy columns - confirm the column sets match
df, df_test = (pd.get_dummies(frame, drop_first=True) for frame in (df, test))

In [4]:
#the columns selected as the most important features
features = [
    'fnlwgt',
    'age',
    'education-num',
    'capital-gain',
    'marital-status_Married-civ-spouse',
    'hours-per-week',
]

In [5]:
#selecting the important feature columns from the dummy-encoded frames
X, X_test = df[features], df_test[features]

In [6]:
#using PolynomialFeatures() to create squared and pairwise interaction terms
#for the most important features

#instantiating a degree-2 expansion without the constant bias column;
#interaction_only=False keeps the squared terms as well as the cross products
pf = PolynomialFeatures(include_bias=False, degree=2, interaction_only=False)

#fitting the transformer on the X features and transforming them
X_poly = pf.fit_transform(X)

#only transforming the test features: the transformer fitted on X is reused,
#so the test data is never used to (re)fit a preprocessing step
X_test_poly = pf.transform(X_test)


In [7]:
#creating a DataFrame from the polyed features

#get_feature_names() was deprecated in scikit-learn 1.0 and removed in 1.2;
#use get_feature_names_out() when available and fall back only on older releases
if hasattr(pf, 'get_feature_names_out'):
    poly_columns = pf.get_feature_names_out(features)
else:
    poly_columns = pf.get_feature_names(features)

#carrying over the index of the input frames so the later index-based merge
#pairs every polynomial row with the row it was computed from
combined = pd.DataFrame(X_poly, columns=poly_columns, index=X.index)

combined_test = pd.DataFrame(X_test_poly, columns=poly_columns, index=X_test.index)

In [8]:
#dropping the degree-1 columns from both polynomial frames; they would be
#duplicates of the original feature columns
first_order_columns = ['fnlwgt', 'age', 'education-num', 'capital-gain',
                       'marital-status_Married-civ-spouse', 'hours-per-week']

for poly_frame in (combined, combined_test):
    poly_frame.drop(columns=first_order_columns, inplace=True)

In [10]:
#previewing the first rows of the polynomial features built from the test data
combined_test.head()

Unnamed: 0,fnlwgt^2,fnlwgt age,fnlwgt education-num,fnlwgt capital-gain,fnlwgt marital-status_Married-civ-spouse,fnlwgt hours-per-week,age^2,age education-num,age capital-gain,age marital-status_Married-civ-spouse,...,education-num^2,education-num capital-gain,education-num marital-status_Married-civ-spouse,education-num hours-per-week,capital-gain^2,capital-gain marital-status_Married-civ-spouse,capital-gain hours-per-week,marital-status_Married-civ-spouse^2,marital-status_Married-civ-spouse hours-per-week,hours-per-week^2
0,51439150000.0,5670050.0,1587614.0,0.0,0.0,9072080.0,625.0,175.0,0.0,0.0,...,49.0,0.0,0.0,280.0,0.0,0.0,0.0,0.0,0.0,1600.0
1,8066555000.0,3412932.0,808326.0,0.0,89814.0,4490700.0,1444.0,342.0,0.0,38.0,...,81.0,0.0,9.0,450.0,0.0,0.0,0.0,1.0,50.0,2500.0
2,113536000000.0,9434628.0,4043412.0,0.0,336951.0,13478040.0,784.0,336.0,0.0,28.0,...,144.0,0.0,12.0,480.0,0.0,0.0,0.0,1.0,40.0,1600.0
3,25703460000.0,7054212.0,1603230.0,1232563000.0,160323.0,6412920.0,1936.0,440.0,338272.0,44.0,...,100.0,76880.0,10.0,400.0,59105344.0,7688.0,307520.0,1.0,40.0,1600.0
4,10711630000.0,1862946.0,1034970.0,0.0,0.0,3104910.0,324.0,180.0,0.0,0.0,...,100.0,0.0,0.0,300.0,0.0,0.0,0.0,0.0,0.0,900.0


In [11]:
#attaching the polynomial columns to the dummy-encoded frames;
#rows are matched on the index and only indexes present in both sides are kept
polyed_data = pd.merge(combined, df, how='inner', left_index=True, right_index=True)
polyed_data_test = pd.merge(combined_test, df_test, how='inner', left_index=True, right_index=True)

In [12]:
#checking the (rows, columns) of the merged test frame
polyed_data_test.shape

(16281, 116)

#### Saving the Data Frame

In [13]:
#writing the merged frame built from df to disk, without the index column
polyed_data.to_csv(path_or_buf='../data/polyed_clean.csv', index=False)

In [14]:
#writing the merged test frame to disk, without the index column
polyed_data_test.to_csv(path_or_buf='../data/polyed_test_clean.csv', index=False)