# 1. Importing Packages

In [1]:
import pandas as pd
import numpy as np

# 2. Loading Dataset from UCI

## 2.1 Data Description

Predicting the age of abalone from physical measurements.

| Name | Data Type | Meas. | Description |
|------|-----------|-------|-------------|
| Sex | nominal | | M, F, and I (infant) |
| Length | continuous | mm | Longest shell measurement |
| Diameter | continuous | mm | Perpendicular to length |
| Height | continuous | mm | With meat in shell |
| Whole weight | continuous | grams | Whole abalone |
| Shucked weight | continuous | grams | Weight of meat |
| Viscera weight | continuous | grams | Gut weight (after bleeding) |
| Shell weight | continuous | grams | After being dried |
| Rings | integer | | Age (target values) |

## 2.2 Data Overview

In [2]:
# Load the abalone CSV. The file has no header row, so header=None makes
# pandas label the columns with integers instead of consuming the first
# record as column names.
# NOTE(review): the path is absolute and machine-specific; adjust it when
# running the notebook elsewhere.
raw_data = pd.read_csv('/home/llw/py_scripts/6_28_presentation/data_set/abalone.csv',header=None) #load data
print(raw_data.head())
print('\n')
# DataFrame.info() writes its report to stdout itself and returns None, so it
# must not be wrapped in print() -- doing so emits a stray "None" line.
raw_data.info()

   0      1      2      3       4       5       6      7   8
0  M  0.455  0.365  0.095  0.5140  0.2245  0.1010  0.150  15
1  M  0.350  0.265  0.090  0.2255  0.0995  0.0485  0.070   7
2  F  0.530  0.420  0.135  0.6770  0.2565  0.1415  0.210   9
3  M  0.440  0.365  0.125  0.5160  0.2155  0.1140  0.155  10
4  I  0.330  0.255  0.080  0.2050  0.0895  0.0395  0.055   7


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4177 entries, 0 to 4176
Data columns (total 9 columns):
0    4177 non-null object
1    4177 non-null float64
2    4177 non-null float64
3    4177 non-null float64
4    4177 non-null float64
5    4177 non-null float64
6    4177 non-null float64
7    4177 non-null float64
8    4177 non-null int64
dtypes: float64(7), int64(1), object(1)
memory usage: 293.8+ KB
None


## 2.3 Data Mapping and Rebuilding

In [13]:
# Reload the CSV so this cell always starts from the unmodified file contents
# and can be re-run without stacking mappings on top of each other.
raw_data = pd.read_csv('/home/llw/py_scripts/6_28_presentation/data_set/abalone.csv',header=None) #load data
attributes = ['sex','length','diameter','height','wh_wt','sk_wt','vr_wt','sl_wt','age']
features =  ['sex','length','diameter','height','wh_wt','sk_wt','vr_wt','sl_wt']
raw_data.columns = attributes

#sex mapping
sex_mapping = {'I':0,'F':1,'M':2}    #mapping strings to integers
# Map only the 'sex' column. A frame-wide replace() would scan every column and
# relies on pandas implicitly downcasting the object column to integers, a
# behaviour that newer pandas versions deprecate.
sex_codes = raw_data['sex'].map(sex_mapping)
if sex_codes.isnull().any():
    # map() yields NaN for any code missing from sex_mapping; fail loudly here
    # instead of letting NaN leak into the feature matrix.
    raise ValueError('unexpected value in sex column; expected one of %s' % sorted(sex_mapping))
raw_data['sex'] = sex_codes.astype(int)

#age mapping: bin the 'age' column into three classes stored as a float array
#   class 0: age <= 10
#   class 1: 10 < age <= 20
#   class 2: age > 20
ages = np.array(raw_data['age'])
age_col = np.zeros(ages.shape[0])
age_col[(ages > 10) & (ages <= 20)] = 1
age_col[ages > 20] = 2

# 3. Training

In [4]:
class LeastSquare():
    '''
    Multi-class linear classifier fitted by least squares.

    A constant 1 is prepended to every sample (bias term), the labels are
    one-hot encoded, and the weight matrix is the least-squares solution of
    x.W = onehot(y). predict() returns, for each sample, the training label
    whose linear score is the largest.
    '''
    def __init__(self):
        self.paras = []         #parameters: (n_features+1, n_classes) array once fit() has run
        self.classes_ = None    #sorted unique training labels, set by fit()

    def fit(self,x,y):
        '''
        estimate the weight matrix from training data

        x : array-like of shape (n_samples, n_features)
        y : array-like of n_samples class labels
        '''
        x = self.x_process(x)
        # Remember the labels so predict() can translate the winning column
        # index back into an actual label; column k of the one-hot matrix
        # corresponds to the k-th entry of np.unique(y).
        self.classes_ = np.unique(np.asarray(y))
        y = self.y_process(y)
        # pinv(x) is mathematically identical to pinv(x^T x) x^T, but avoids
        # forming x^T x, which squares the condition number of the problem.
        self.paras = np.dot(np.linalg.pinv(x),y)

    def y_process(self,y):
        '''
        build a multi-dimension y matrix if it's multi-class

        returns an (n_samples, n_classes) 0/1 integer matrix with one column
        per distinct label, columns ordered like np.unique(y)
        '''
        y = np.asarray(y).reshape(-1,1)
        y_unique = np.unique(y).reshape(1,-1)   # 1 * n_classes row of labels
        # (n_samples,1) == (1,n_classes) broadcasts to (n_samples,n_classes)
        res_mat = (y==y_unique).astype(int)
        return res_mat

    def x_process(self,x):
        '''
        add ones

        returns x with a leading column of ones, shape (n_samples, n_features+1)
        '''
        x = np.asarray(x)
        train_sample_num = x.shape[0]
        x = np.hstack((np.ones((train_sample_num,1)),x))    # add ones
        return x

    def predict(self,xtest):
        '''
        predict a class label for every row of xtest

        returns a 1-d array of n_samples labels taken from the training labels
        raises ValueError if called before fit()
        '''
        if len(self.paras) == 0:
            raise ValueError('LeastSquare.predict() called before fit()')
        xtest = self.x_process(xtest)
        scores = np.dot(xtest,self.paras)       # (n_samples, n_classes)
        best = np.argmax(scores,axis=1)
        # argmax yields a column index; map it back to the label it stands for
        # so labels that are not exactly 0..k-1 are predicted correctly.
        res_pre = self.classes_[best]
        return res_pre

### 3.1 Training and Testing Set

In [5]:
# Sizes of the fixed hold-out split used by the cells below: the first
# train_num rows are used for training, the remaining rows for testing.
raw_data.columns = attributes   # (re)apply the column labels
train_num = 4100
num = len(raw_data)             # total number of samples
test_num = num - train_num

def my_train_test_split(raw_data,attr,labels=None,n_train=None):
    '''
    split the chosen feature columns and the class labels into a leading
    training part and a trailing testing part (no shuffling)

    raw_data : DataFrame holding the feature columns
    attr     : list of column names to use as features (any length)
    labels   : 1-d array-like with one class label per row of raw_data;
               defaults to the notebook-level age_col
    n_train  : number of leading rows used for training;
               defaults to the notebook-level train_num

    returns [xtrain,ytrain,xtest,ytest] as numpy arrays
    raises ValueError if labels and raw_data differ in length
    '''
    # Resolve the defaults at call time so the function keeps working exactly
    # as before when called with two arguments inside the notebook.
    if labels is None:
        labels = age_col
    if n_train is None:
        n_train = train_num
    xdata = np.array(raw_data[list(attr)])
    ydata = np.asarray(labels)
    if xdata.shape[0] != ydata.shape[0]:
        raise ValueError('labels has %d entries but raw_data has %d rows'
                         % (ydata.shape[0],xdata.shape[0]))
    xtrain = xdata[0:n_train]
    xtest = xdata[n_train:]
    ytrain = ydata[0:n_train]
    ytest = ydata[n_train:]
    return [xtrain,ytrain,xtest,ytest]

### 3.2 Sklearn Baseline

In [15]:
# Build the full feature matrix and label vector, then cut both at train_num:
# leading rows form the training set, trailing rows the test set.
data = raw_data[attributes]
xdata, ydata = np.array(data[features]), age_col
xtrain, xtest = xdata[:train_num], xdata[train_num:]
ytrain, ytest = ydata[:train_num], ydata[train_num:]

from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.linear_model import LogisticRegression
# Baseline classifier. To compare other models, replace SVC() with
# DecisionTreeClassifier() or LogisticRegression() (both imported above).
model = SVC()
model.fit(xtrain,ytrain)
pre = model.predict(xtest)
assert pre.shape==ytest.shape
# accuracy = fraction of test samples whose predicted class equals the label
n_correct = (pre==ytest).sum()
print('accuracy =',n_correct/test_num)

accuracy = 0.7922077922077922


### 3.3 LeastSquare 

In [14]:
# Fit the LeastSquare classifier on every unordered pair of features and
# report the test accuracy obtained with each pair.
features =  ['sex','length','diameter','height','wh_wt','sk_wt','vr_wt','sl_wt']
for first_idx, first_attr in enumerate(features[:-1]):
    # pair the current feature only with the ones after it, so each pair
    # is visited exactly once
    for second_attr in features[first_idx+1:]:
        attr = [first_attr,second_attr]
        xtrain,ytrain,xtest,ytest = my_train_test_split(raw_data,attr)
        ls = LeastSquare()
        ls.fit(xtrain,ytrain)
        pre = ls.predict(xtest)
        assert pre.shape==ytest.shape
        accuracy = sum(pre==ytest)/test_num
        print('accuracy with attrs chosen as %s and %s = %lf'%(attr[0],attr[1],accuracy))

accuracy with attrs chosen as sex and length = 0.792208
accuracy with attrs chosen as sex and diameter = 0.805195
accuracy with attrs chosen as sex and height = 0.740260
accuracy with attrs chosen as sex and wh_wt = 0.753247
accuracy with attrs chosen as sex and sk_wt = 0.753247
accuracy with attrs chosen as sex and vr_wt = 0.766234
accuracy with attrs chosen as sex and sl_wt = 0.792208
accuracy with attrs chosen as length and diameter = 0.779221
accuracy with attrs chosen as length and height = 0.753247
accuracy with attrs chosen as length and wh_wt = 0.753247
accuracy with attrs chosen as length and sk_wt = 0.792208
accuracy with attrs chosen as length and vr_wt = 0.766234
accuracy with attrs chosen as length and sl_wt = 0.766234
accuracy with attrs chosen as diameter and height = 0.766234
accuracy with attrs chosen as diameter and wh_wt = 0.766234
accuracy with attrs chosen as diameter and sk_wt = 0.779221
accuracy with attrs chosen as diameter and vr_wt = 0.779221
accuracy with att

### 3.4 LDA

In [47]:
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis

# Same hold-out split as the other experiments: leading train_num rows train,
# the remaining rows test.
data = raw_data[attributes]
xdata, ydata = np.array(data[features]), age_col
xtrain, ytrain = xdata[:train_num], ydata[:train_num]
xtest, ytest = xdata[train_num:], ydata[train_num:]

# Fit LDA with two discriminant components, then both classify the test set
# and project it onto those components.
lda = LinearDiscriminantAnalysis(n_components=2)
lda.fit(xtrain,ytrain)
pre = lda.predict(xtest)
xtest_trans = lda.transform(xtest)

assert pre.shape==ytest.shape
lda_accuracy = sum(pre==ytest)/test_num
print('accuracy with lda = ',lda_accuracy)
print('projected data',xtest_trans)
print('explained_variance_ratio',lda.explained_variance_ratio_)


accuracy with lda =  0.7662337662337663
projected data [[-0.11083513 -0.8562112 ]
 [ 0.99687589 -0.59792295]
 [ 1.94705638 -0.65126871]
 [ 1.30314118 -1.61880921]
 [ 0.76268405 -1.70750558]
 [ 1.23844672 -0.378183  ]
 [ 1.37026099 -2.12822668]
 [-1.45901586  0.61949359]
 [-1.27130713  0.24550984]
 [-0.6630661  -0.35321381]
 [ 1.00457926 -0.52202528]
 [-0.96121898 -0.30356993]
 [ 0.8781679   0.17888481]
 [-1.1884999  -0.9182526 ]
 [-1.30552974 -1.06486272]
 [-0.78718608 -1.51818474]
 [-0.91180376 -1.51926221]
 [ 0.07471849 -1.31813369]
 [ 1.23767096 -0.60276846]
 [-1.54440598  1.39260677]
 [-0.75198772  0.75754039]
 [-0.7143742   0.97811519]
 [-0.55024628  0.36149754]
 [-0.55129548  0.28863445]
 [-0.48832532  0.17694123]
 [-0.76630774 -0.39557609]
 [ 0.19181586  0.16074877]
 [-0.38476264  0.20985347]
 [ 0.10081652 -0.99562095]
 [-0.20889044  0.95618391]
 [ 0.33568245 -0.17856286]
 [-0.37165977  0.75379955]
 [ 0.96603331 -0.17730232]
 [ 0.23852803 -1.53830707]
 [ 0.04932186 -0.52438105]
