# Case Study One

In [2]:
import pandas as pd
import warnings
import sys
from sklearn.linear_model import LinearRegression
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import Lasso
from sklearn.model_selection import cross_val_score
from sklearn.linear_model import Ridge
#from ml_metrics import rmse
#warnings.filterwarnings("ignore")


In [3]:
# Load both source tables from CSV files in the working directory
train, unique_m = (pd.read_csv(csv_name) for csv_name in ('train.csv', 'unique_m.csv'))



In [4]:
 #Dropping critical temp from train
# unique_m supplies the critical_temp column that is used as the target later
# on, so the copy in train is removed before the two tables are joined.
# Rebinding (instead of inplace=True) together with errors='ignore' keeps this
# cell safe to re-run once the column is already gone.
train = train.drop(columns=['critical_temp'], errors='ignore')

In [5]:
# Row and column counts of unique_m as loaded
unique_m.shape

(21263, 88)

In [6]:
# Row and column counts of train once critical_temp has been removed
train.shape

(21263, 81)

In [7]:
# Join the two tables column-wise; rows are matched on their index labels
SuperConducters = pd.concat((train, unique_m), axis='columns')

In [8]:
# pd.concat already returns a DataFrame, so re-wrapping it adds nothing;
# verify the type instead of rebuilding the object.
assert isinstance(SuperConducters, pd.DataFrame)

In [9]:
# Size of the combined table right after the column-wise join
SuperConducters.shape

(21263, 169)

In [10]:
# Check whether every column is numeric by listing all dtypes.
# option_context lifts the row limit only for this print and restores the
# previous setting afterwards, even if the print raises.
with pd.option_context('display.max_rows', None):
    print(SuperConducters.dtypes)


number_of_elements                   int64
mean_atomic_mass                   float64
wtd_mean_atomic_mass               float64
gmean_atomic_mass                  float64
wtd_gmean_atomic_mass              float64
entropy_atomic_mass                float64
wtd_entropy_atomic_mass            float64
range_atomic_mass                  float64
wtd_range_atomic_mass              float64
std_atomic_mass                    float64
wtd_std_atomic_mass                float64
mean_fie                           float64
wtd_mean_fie                       float64
gmean_fie                          float64
wtd_gmean_fie                      float64
entropy_fie                        float64
wtd_entropy_fie                    float64
range_fie                          float64
wtd_range_fie                      float64
std_fie                            float64
wtd_std_fie                        float64
mean_atomic_radius                 float64
wtd_mean_atomic_radius             float64
gmean_atomi

In [11]:
# 'material' holds object (non-numeric) values, so it is removed from the table
SuperConducters = SuperConducters.drop(columns=['material'])

In [12]:
# Count missing entries per column, then list only the columns that have any,
# largest count first (an empty Series means nothing is missing)
null_counts = SuperConducters.isna().sum()
null_counts.loc[null_counts.gt(0)].sort_values(ascending=False)


Series([], dtype: int64)

In [13]:
# Collect the columns that hold one single distinct value across all rows
# (a missing value counts as a value of its own, as with Series.unique())
Myunique = [
    col
    for col in SuperConducters.columns
    if SuperConducters[col].nunique(dropna=False) == 1
]
print(Myunique)

['Helium', 'Neon', 'Argon', 'Krypton', 'Xenon', 'Promethium', 'Polonium', 'Astatine', 'Radon']


In [14]:
# All nine single-valued features hold the value 0.
# The columns are selected through Myunique (the single-valued columns detected
# earlier) so this view stays tied to the detection step instead of a
# hand-copied list of names.
SuperConducters[Myunique]

Unnamed: 0,Helium,Neon,Argon,Krypton,Xenon,Promethium,Polonium,Astatine,Radon
0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...
21258,0,0,0,0,0,0,0,0,0
21259,0,0,0,0,0,0,0,0,0
21260,0,0,0,0,0,0,0,0,0
21261,0,0,0,0,0,0,0,0,0


In [15]:
# Dropping features with only 0 values.
# The names come from Myunique (the single-valued columns detected earlier)
# rather than from a hard-coded copy of that list.
SuperConducters = SuperConducters.drop(columns=Myunique)

In [16]:
# Re-check that every remaining column is numeric by listing all dtypes.
# option_context lifts the row limit only for this print and restores the
# previous setting afterwards, even if the print raises.
with pd.option_context('display.max_rows', None):
    print(SuperConducters.dtypes)


number_of_elements                   int64
mean_atomic_mass                   float64
wtd_mean_atomic_mass               float64
gmean_atomic_mass                  float64
wtd_gmean_atomic_mass              float64
entropy_atomic_mass                float64
wtd_entropy_atomic_mass            float64
range_atomic_mass                  float64
wtd_range_atomic_mass              float64
std_atomic_mass                    float64
wtd_std_atomic_mass                float64
mean_fie                           float64
wtd_mean_fie                       float64
gmean_fie                          float64
wtd_gmean_fie                      float64
entropy_fie                        float64
wtd_entropy_fie                    float64
range_fie                          float64
wtd_range_fie                      float64
std_fie                            float64
wtd_std_fie                        float64
mean_atomic_radius                 float64
wtd_mean_atomic_radius             float64
gmean_atomi

In [17]:
# Table size after removing 'material' and the single-valued columns
SuperConducters.shape

(21263, 159)

In [18]:
# Separate the regression target (y) from the predictor columns (x)
y = SuperConducters['critical_temp']
x = SuperConducters.drop(columns=['critical_temp'])

In [19]:
# Report the dimensions of the predictor table and of the target
for caption, obj in (('x shape = ', x), ('y shape = ', y)):
    print(caption, obj.shape)

x shape =  (21263, 158)
y shape =  (21263,)


# Normalizing Data

In [20]:
# Scaler with default settings: centers each feature and scales it to unit variance
scaler = StandardScaler()

In [21]:
# Learn the per-column scaling statistics from x, then rescale x with them.
# NOTE(review): the statistics are computed from every row, including rows that
# are later sampled into test_set — confirm this is acceptable for evaluation.
x_scaled = scaler.fit(x).transform(x)

In [22]:
# Preview the scaled values (a bare NumPy array at this point, without column names)
x_scaled

array([[-0.0800575 ,  0.04673292, -0.45165095, ..., -0.17611284,
        -0.15476395, -0.30645744],
       [ 0.6147436 ,  0.17426938, -0.43207104, ..., -0.17611284,
        -0.15476395, -0.30645744],
       [-0.0800575 ,  0.04673292, -0.45097762, ..., -0.17611284,
        -0.15476395, -0.30645744],
       ...,
       [-1.46965972,  0.40792698,  0.67545689, ..., -0.17611284,
        -0.15476395, -0.30645744],
       [-1.46965972,  0.40792698,  0.71984372, ..., -0.17611284,
        -0.15476395, -0.30645744],
       [-0.77485861, -0.0030091 ,  0.41416385, ..., -0.17611284,
        -0.15476395, -0.30645744]])

In [23]:
# Re-attach the column names and the row labels that the scaler output lost.
# Passing index=x.index keeps x_scaled aligned with y by label, which matters
# for any label-based selection (e.g. picking the target values that belong
# to a sampled subset of rows).
feature_names = x.columns.tolist()
x_scaled = pd.DataFrame(x_scaled, columns=feature_names, index=x.index)

In [24]:
# Preview the scaled data now that it is a DataFrame with named columns
x_scaled

Unnamed: 0,number_of_elements,mean_atomic_mass,wtd_mean_atomic_mass,gmean_atomic_mass,wtd_gmean_atomic_mass,entropy_atomic_mass,wtd_entropy_atomic_mass,range_atomic_mass,wtd_range_atomic_mass,std_atomic_mass,...,Tungsten,Rhenium,Osmium,Iridium,Platinum,Gold,Mercury,Thallium,Lead,Bismuth
0,-0.080058,0.046733,-0.451651,-0.158850,-0.611819,0.044358,-0.003707,0.133725,-0.053039,0.378186,...,-0.063318,-0.032448,-0.079756,-0.071179,-0.110785,-0.028601,-0.178115,-0.176113,-0.154764,-0.306457
1,0.614744,0.174269,-0.432071,0.059368,-0.604180,0.777430,-0.015267,0.133725,0.108900,0.134901,...,-0.063318,-0.032448,-0.079756,-0.071179,-0.110785,-0.028601,-0.178115,-0.176113,-0.154764,-0.306457
2,-0.080058,0.046733,-0.450978,-0.158850,-0.611658,0.044358,-0.218984,0.133725,0.093294,0.378186,...,-0.063318,-0.032448,-0.079756,-0.071179,-0.110785,-0.028601,-0.178115,-0.176113,-0.154764,-0.306457
3,-0.080058,0.046733,-0.451314,-0.158850,-0.611739,0.044358,-0.103615,0.133725,0.020128,0.378186,...,-0.063318,-0.032448,-0.079756,-0.071179,-0.110785,-0.028601,-0.178115,-0.176113,-0.154764,-0.306457
4,-0.080058,0.046733,-0.452324,-0.158850,-0.611980,0.044358,0.162775,0.133725,-0.199372,0.378186,...,-0.063318,-0.032448,-0.079756,-0.071179,-0.110785,-0.028601,-0.178115,-0.176113,-0.154764,-0.306457
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
21258,-0.080058,0.653740,-0.593991,0.361744,-0.420307,0.031615,0.473913,0.572626,-0.657124,1.066745,...,-0.063318,-0.032448,-0.079756,-0.071179,-0.110785,-0.028601,-0.178115,-0.176113,-0.154764,-0.306457
21259,0.614744,0.158685,-0.715653,-0.208768,-0.700465,0.432089,1.264895,1.332391,-0.959386,1.239871,...,-0.063318,-0.032448,-0.079756,-0.071179,-0.110785,-0.028601,-0.178115,3.496426,-0.154764,-0.306457
21260,-1.469660,0.407927,0.675457,0.906983,1.007482,-1.300993,-1.329515,-1.868861,0.734817,-1.878471,...,-0.063318,-0.032448,-0.079756,-0.071179,-0.110785,-0.028601,-0.178115,-0.176113,-0.154764,-0.306457
21261,-1.469660,0.407927,0.719844,0.906983,1.046684,-1.300993,-1.053776,-1.868861,-0.078244,-1.878471,...,-0.063318,-0.032448,-0.079756,-0.071179,-0.110785,-0.028601,-0.178115,-0.176113,-0.154764,-0.306457


# Linear Regression Model

## Creating train and test sets

In [25]:
# Size of the scaled predictor table before it is split
x_scaled.shape

(21263, 158)

In [26]:
# Create training and testing sets (cross-validation not needed).
# 70% of the rows are sampled reproducibly (fixed random_state) for training;
# every remaining row, identified by its index label, goes to the test set.
# Dropping by label replaces the isin()/dropna() masking trick, which would
# also discard any held-out row that legitimately contains a NaN.
train_set = x_scaled.sample(frac=0.7, random_state=100)
test_set = x_scaled.drop(index=train_set.index)

# The two parts must partition x_scaled exactly.
assert len(train_set) + len(test_set) == len(x_scaled)

print('train_set -',train_set.shape[0])
print('test_set -', test_set.shape[0])
print('train_set + test_set = x_scaled -',train_set.shape[0]+test_set.shape[0])

train_set - 14884
test_set - 6379
train_set + test_set = x_scaled - 21263


In [27]:
# Put the row-display limit back to its default before previewing
pd.reset_option('display.max_rows')
# First rows of the training split; the index shows the original row labels in sampled order
train_set.head()

Unnamed: 0,number_of_elements,mean_atomic_mass,wtd_mean_atomic_mass,gmean_atomic_mass,wtd_gmean_atomic_mass,entropy_atomic_mass,wtd_entropy_atomic_mass,range_atomic_mass,wtd_range_atomic_mass,std_atomic_mass,...,Tungsten,Rhenium,Osmium,Iridium,Platinum,Gold,Mercury,Thallium,Lead,Bismuth
19769,-0.774859,0.084872,0.218639,0.408731,0.474548,-0.38021,0.07644,-0.64785,-1.025344,-0.46912,...,-0.063318,-0.032448,-0.079756,-0.071179,-0.110785,-0.028601,-0.178115,-0.176113,-0.154764,-0.306457
10404,-0.774859,-0.72632,-0.250052,-0.241872,0.098392,-0.280359,-0.55501,-1.378818,-0.170709,-1.320008,...,-0.063318,-0.032448,-0.079756,-0.071179,-0.110785,-0.028601,-0.178115,-0.176113,-0.154764,-0.306457
1829,1.309545,0.197243,-0.392217,-0.074841,-0.608688,1.037617,0.975193,1.416547,-0.286511,1.051086,...,-0.063318,-0.032448,-0.079756,-0.071179,-0.110785,-0.028601,-0.178115,-0.176113,-0.154764,2.742732
15434,-1.46966,2.039376,1.973764,2.465481,2.197477,-1.299951,-2.496088,-1.780424,3.809035,-1.757909,...,-0.063318,-0.032448,-0.079756,-0.071179,-0.110785,-0.028601,-0.178115,-0.176113,-0.154764,-0.306457
16179,-0.774859,0.051026,0.480255,0.55135,0.814698,-0.203982,0.067977,-1.637563,-0.908851,-1.677514,...,-0.063318,-0.032448,-0.079756,-0.071179,-0.110785,-0.028601,-0.178115,-0.176113,-0.154764,-0.306457


In [28]:
# Linear regression estimator created with default settings
lr_scaled = LinearRegression()

In [29]:
# Fit on the full scaled predictor table (input) against critical_temp (target).
# NOTE(review): train_set / test_set created earlier are not used here, so the
# model sees every row — confirm whether fitting on train_set only was intended.
lr_scaled.fit(X=x_scaled, y=y)

LinearRegression()

In [30]:
#for i in range(len(x_scaled.columns)):
 #   print(x_scaled.columns[i], lr_scaled.coef_[i])

In [31]:
# Wrap the feature names and the fitted coefficients in single-column frames
# so they can be placed side by side in the next step.
# Note: feature_names is rebound here from a list of names to a DataFrame.
feature_names = pd.DataFrame(data=feature_names)
lr_scaled_coef = pd.DataFrame(data=lr_scaled.coef_)

In [32]:
# Pair each feature with its fitted coefficient.
# Names are read from x_scaled.columns (the table the model was fitted on) and
# values from lr_scaled.coef_, so this cell does not depend on feature_names
# having been rebound to a DataFrame, nor on positional concat alignment, and
# gives the same table when re-run on its own.
lr_scaled_coefs = pd.DataFrame({
    'Features': list(x_scaled.columns),
    'Coefficients': lr_scaled.coef_,
})

In [33]:
# Feature / coefficient table in the original column order
lr_scaled_coefs

Unnamed: 0,Features,Coefficients
0,number_of_elements,0.210294
1,mean_atomic_mass,25.033679
2,wtd_mean_atomic_mass,-39.990002
3,gmean_atomic_mass,-12.373361
4,wtd_gmean_atomic_mass,24.979403
...,...,...
153,Gold,-0.475718
154,Mercury,1.262132
155,Thallium,1.523480
156,Lead,0.434396


In [34]:
# Rank the features by absolute coefficient size, largest first.
# The display tweaks (no row limit, nine fixed decimals) are scoped to this
# output with option_context so they do not leak into later cells, in line with
# how display.max_rows is restored after the dtype listings.
# display() is provided by the notebook kernel; it is needed because a bare
# expression inside a with-block is not rendered automatically.
with pd.option_context(
    'display.max_rows', None,
    'display.float_format', lambda value: '%.9f' % value,
):
    display(lr_scaled_coefs.sort_values(by='Coefficients', ascending=False, key=abs))

Unnamed: 0,Features,Coefficients
24,wtd_gmean_atomic_radius,-64.298730046
22,wtd_mean_atomic_radius,60.118301582
2,wtd_mean_atomic_mass,-39.990001829
76,wtd_entropy_Valence,-27.304462094
52,wtd_mean_FusionHeat,-26.436488551
1,mean_atomic_mass,25.033679346
4,wtd_gmean_atomic_mass,24.979402853
49,std_ElectronAffinity,24.827528238
62,wtd_mean_ThermalConductivity,23.100941917
75,entropy_Valence,23.095734647


In [35]:
# R^2 of the fitted model, evaluated on the same rows it was fitted on.
# NOTE(review): this is an in-sample figure; test_set is not used here —
# confirm whether a held-out score was intended.
lr_scaled.score(X=x_scaled, y=y)

0.7646191632540855