In [19]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.model_selection import RepeatedKFold
from sklearn.model_selection import GridSearchCV
from sklearn.linear_model import LinearRegression
from sklearn.linear_model import Ridge
from sklearn.linear_model import Lasso
from sklearn.linear_model import ElasticNet

Here are examples of loading data with different formats. 

In [8]:
import os
def read_csv_space(path, file):
    return pd.read_csv(os.path.join(path, file), encoding='', sep='\s+', index_col=False, engine='python') 

def read_csv_comma(path, file):
    return pd.read_csv(os.path.join(path, file), encoding='', sep=',', engine='python') 

def json_load(path, file):
    with open(os.path.join(path, file), encoding='', mode='r') as f:
        v = json.load(f)
        f.close()
    return v

def read_text(path, file):
    with open(os.path.join(path, file),encoding='', mode='r') as f:
        lines = []
        for l in f:
            lines.append(l.rstrip('\n').split('\t'))
    return lines 

def table_load(self, path, file_name):
    df = pd.read_excel(os.path.join(path, file_name), engine='openpyxl', sheet_name=0)
    mylist = df['words'].tolist()
    return mylist

We still use the Boston housing price data for demostration. 

In [9]:
import warnings
warnings.filterwarnings(action='ignore', category=FutureWarning) 
from sklearn.datasets import load_boston
boston=load_boston()
boston_df=pd.DataFrame(boston.data,columns=boston.feature_names)
boston_df['Price']=boston.target
xmat=boston_df.drop('Price',axis=1)
y =boston_df['Price']
X_list = xmat.columns 
print(X_list)

Index(['CRIM', 'ZN', 'INDUS', 'CHAS', 'NOX', 'RM', 'AGE', 'DIS', 'RAD', 'TAX',
       'PTRATIO', 'B', 'LSTAT'],
      dtype='object')


Split data into training and test sample. 

In [10]:
X_train,X_test,y_train,y_test=train_test_split(xmat,y,test_size=0.2,random_state=3)

Linear regression 

In [17]:
reg = LinearRegression()
reg.fit(X_train, y_train)
train_score=reg.score(X_train,y_train)
test_score=reg.score(X_test,y_test)
print(test_score)
print(reg.coef_)
yhat = reg.predict(X_test)

0.7952617563243852
[-1.23897571e-01  4.81822924e-02 -4.74497796e-02  3.36938950e+00
 -1.56635488e+01  3.59419367e+00 -9.33206067e-03 -1.47089101e+00
  3.05053544e-01 -1.08397039e-02 -9.08791339e-01  1.00352939e-02
 -4.77714677e-01]


In [21]:
lasso = Lasso(alpha=1)
lasso.fit(X_train,y_train)
train_score=lasso.score(X_train,y_train)
test_score=lasso.score(X_test,y_test)
coeff_used = np.sum(lasso.coef_!=0)
print(test_score)
print(lasso.coef_)

0.71843103569211
[-0.08159032  0.05363088 -0.03523796  0.         -0.          0.70939671
  0.00989557 -0.72872731  0.26217484 -0.01333177 -0.69820833  0.00823025
 -0.73035067]


In [14]:
elas = ElasticNet(alpha=0.01, l1_ratio=0.5)
elas.fit(X_train, y_train)
train_score=elas.score(X_train,y_train)
test_score=elas.score(X_test,y_test)
print(test_score)


0.7840281830284848


In [16]:
model = Lasso()
# define model evaluation method
cv = RepeatedKFold(n_splits=10, n_repeats=3, random_state=1)
# define grid
grid = dict()
grid['alpha'] = np.arange(0.001, 1, 0.01)
# define search
search = GridSearchCV(model, grid, scoring='neg_mean_absolute_error', cv=cv, n_jobs=-1)
# perform the search
results = search.fit(xmat, y)
# summarize
print('MAE: %.3f' % results.best_score_)
print('Config: %s' % results.best_params_)

MAE: -3.378
Config: {'alpha': 0.011}
