# <p class="lead"><i>This Jupyter notebook performs Feature Selection and Prediction </i></p>

In [10]:
from sklearn.model_selection import train_test_split
from sklearn import linear_model
from sklearn.metrics import *
lr=linear_model.LinearRegression()
from sklearn.ensemble import RandomForestRegressor
from sklearn.preprocessing import StandardScaler
from sklearn.neural_network import MLPRegressor
from sklearn.metrics import r2_score

#os.chdir('ozonedaily')
df1 =  pd.read_csv("Summarized_daily_ozone.csv",low_memory=False)
df1 = df1.drop('Method_Code',axis=1)
train, test = train_test_split(df1, test_size = 0.2)

def calculatestat(lr,xaxis,yaxis):
    y_pred=lr.predict(xaxis)
    MAE=mean_absolute_error(yaxis,y_pred)
    print("Mean Absolute Error: ", MAE) 
    RMSE=math.sqrt(mean_squared_error(yaxis,y_pred))
    print("Root of Mean Squared Deviation: ",RMSE)
    
def LinearRegressionAnalysis(xtrain,ytrain,xtest,ytest):
    trainreg  = lr.fit(xtrain, ytrain)
    print("---------------Linear Regression---------------")
    print("Train Data:")
    print(calculatestat(trainreg, xtrain, ytrain ))
    print("Test Data:")
    print(calculatestat(trainreg, xtest, ytest))
    
def Random_Forest(xtrain,ytrain,xtest,ytest):
    rand_forest = RandomForestRegressor(n_estimators=15,max_depth=10)
    rand_forest = rand_forest.fit(xtrain,ytrain)
    print("---------------Random Forest---------------")
    print("Train Data:")
    print(calculatestat(rand_forest, xtrain, ytrain ))
    print("Test Data:")
    print(calculatestat(rand_forest,xtest,ytest))
    
def Neural_Network(xtrain,ytrain,xtest,ytest):
    #neural network
    xtrain = StandardScaler().fit_transform(xtrain)
    xtest = StandardScaler().fit_transform(xtest)
    mlp = MLPRegressor( hidden_layer_sizes=20,max_iter = 200, shuffle=True, random_state=1)
    fit = mlp.fit(xtrain, ytrain)
    print("---------------Neural Forest---------------")
    print("Train Data:")
    print(calculatestat(fit, xtrain, ytrain ))
    print("Test Data:")
    print(calculatestat(fit,xtest,ytest))
    print("Training score is ",mlp.score(xtrain, ytrain))
    print("Testing score is ",mlp.score(xtest, ytest))

    
ytrain = train.AQI
xtrain = train.drop('AQI',axis=1)._get_numeric_data()
ytest = test.AQI
xtest = test.drop('AQI',axis=1)._get_numeric_data()
LinearRegressionAnalysis(xtrain,ytrain,xtest,ytest)
Random_Forest(xtrain,ytrain,xtest,ytest)
Neural_Network(xtrain,ytrain,xtest,ytest)

---------------Linear Regression---------------
Train Data:
Mean Absolute Error:  4.38988828139
Root of Mean Squared Deviation:  6.745477074883096
None
Test Data:
Mean Absolute Error:  4.38801552929
Root of Mean Squared Deviation:  6.734475024312339
None
---------------Random Forest---------------
Train Data:
Mean Absolute Error:  0.00604447918434
Root of Mean Squared Deviation:  0.3431471448000984
None
Test Data:
Mean Absolute Error:  0.00983101470187
Root of Mean Squared Deviation:  0.6193625859253024
None
---------------Random Forest---------------
Train Data:
Mean Absolute Error:  0.280208257634
Root of Mean Squared Deviation:  0.7601540046003697
None
Test Data:
Mean Absolute Error:  0.279682262218
Root of Mean Squared Deviation:  0.7310861555278891
None
Training score is  0.998338120965
Testing score is  0.998459721081


In [45]:
from sklearn.feature_selection import RFE
from sklearn import feature_selection
from sklearn.linear_model import LinearRegression
from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import chi2
from sklearn.feature_selection import mutual_info_regression
from sklearn.feature_selection import f_classif
from sklearn.feature_selection import f_regression
from sklearn.model_selection import train_test_split
from sklearn.linear_model import RandomizedLasso
from sklearn.feature_selection import SelectPercentile
from sklearn.metrics import r2_score
import numpy as np
#os.chdir('ozonedaily')
df1 =  pd.read_csv("Summarized_daily_ozone.csv",low_memory=False)
df1 = df1.drop(['Method_Code','CBSA_Name','Local_Site_Name','Parameter_Code'],axis=1)

train, test = train_test_split(df1, test_size = 0.2)
ytrain = train.AQI
xtrain = train.drop('AQI',axis=1)._get_numeric_data()
ytest = test.AQI
xtest = test.drop('AQI',axis=1)._get_numeric_data()

lr = LinearRegression()

rfe = RFE(lr, n_features_to_select=6,step=1)
rfe = rfe.fit(xtrain,ytrain)
s1 = r2_score(ytrain,rfe.predict(xtrain))
print ("RFE: ")
print ("RFE Features sorted by their rank: ",s1)
#print (list(zip(xtrain.columns,rfe.ranking_)))
print(list(zip(map(lambda x: round(x, 2), rfe.ranking_),xtrain )))
print ("--------------------------------------------------")

print ("K best:")
feat=SelectKBest(score_func = f_classif ,k=6) 
feat = feat.fit(xtrain,ytrain)
x_train = feat.fit_transform(xtrain,ytrain)
kbest = lr.fit(x_train,ytrain)
score = lr.score(x_train, ytrain)
sc=r2_score(ytrain,lr.predict(x_train))
print("K best score:", sc)
print(list(zip(xtrain,K.scores_)))
print("--------------------------------------------------")

rlasso = RandomizedLasso(alpha= 'bic',verbose=False, n_resampling=100, n_jobs=1)
rlasso.fit(xtrain,ytrain)
print("Randomized Lasso:")
print("Features sorted by their score:")
coef = pd.DataFrame(rlasso.scores_, columns = ['RandomizedLasso_score'])
print (list(zip(map(lambda x: round(x, 2), rlasso.scores_), xtrain.columns.values)))
print("--------------------------------------------------")

print("Percentile Selection:")
fs = SelectPercentile(score_func = f_regression, percentile=20)
fs = fs.fit(xtrain,ytrain)
print('Features sorted by score:')
print(list(zip(map(lambda x: round(x, 2), fs.scores_), xtrain.columns.values)))


RFE: 
RFE Features sorted by their rank:  0.868466288479
[(5, 'State_Code'), (1, 'POC'), (3, 'Latitude'), (2, 'Longitude'), (4, 'Date_Local'), (1, 'Observation_Count'), (1, 'Observation_Percent'), (1, 'Arithmetic_Mean'), (1, '1st_Max_Value'), (1, '1st_Max_Hour')]
--------------------------------------------------
K best:
K best score: 0.868638528201
[('State_Code', 111.5073414900608), ('POC', 20.382604469602214), ('Latitude', 28.457504351906994), ('Longitude', 214.35450411712358), ('Date_Local', 123.87154278297174), ('Observation_Count', 119.68546917754333), ('Observation_Percent', 122.16613516073123), ('Arithmetic_Mean', 52604.993448261586), ('1st_Max_Value', 6683471.7552595995), ('1st_Max_Hour', 29.792727587398705)]
--------------------------------------------------
Randomized Lasso:
Features sorted by their score:
[(1.0, 'State_Code'), (1.0, 'POC'), (1.0, 'Latitude'), (1.0, 'Longitude'), (1.0, 'Date_Local'), (1.0, 'Observation_Count'), (1.0, 'Observation_Percent'), (1.0, 'Arithmetic