In [None]:
import pandas as pd
import numpy as np
from sklearn.decomposition import PCA
from sklearn import cross_validation
import matplotlib.pyplot as plt
import statsmodels.api as sm
import scipy.stats as stats
from itertools import product

In [None]:
# Load the paired-keypoint dataset and discard the first and last CSV columns.
original_df = pd.read_csv('/root/data/small_pen_data_collection/dataset_13k_pairs.csv')
# original_df = original_df.filter(original_df.columns.tolist()[1:-1]).dropna()
original_df = original_df.filter(original_df.columns.tolist()[1:-1])

# Assign a 1-based fish_id to every distinct ground_truth value
# (presumably one biomass value per physical fish -- TODO confirm).
original_df['fish_id'] = 1
fish_counter = 1
for biomass in original_df.ground_truth.unique():
    mask = original_df.ground_truth == biomass
    # .loc is the label/boolean indexer; DataFrame.ix no longer exists in pandas >= 1.0.
    original_df.loc[mask, 'fish_id'] = fish_counter
    fish_counter += 1

In [None]:
# Distribution of feature '34' for fish #2 (NaNs dropped). Uses .loc because
# DataFrame.ix no longer exists in pandas >= 1.0.
plt.hist(original_df.loc[original_df.fish_id == 2, '34'].dropna(), bins=50)

<h2> Define all possible features we care about </h2>

In [None]:
def add_convolution_features(df, primary_features):
    """Add pairwise product columns to ``df`` in place.

    For every unordered pair (including a feature with itself) of
    ``primary_features`` a column named ``'<a>,<b>'`` holding ``df[a] * df[b]``
    is written to ``df``.

    Returns the list of created column names, in creation order.
    """
    created = []
    for start, left in enumerate(primary_features):
        # Starting at `start` visits each unordered pair exactly once.
        for right in primary_features[start:]:
            name = '{},{}'.format(left, right)
            df[name] = df[left] * df[right]
            created.append(name)
    return created

def add_square_features(df, primary_features):
    """Add a squared column ``'<f>,<f>'`` to ``df`` (in place) for each feature.

    Returns the list of created column names, in input order.
    """
    named = [(feature, '{},{}'.format(feature, feature)) for feature in primary_features]
    for feature, name in named:
        df[name] = df[feature]**2
    return [name for _, name in named]

def add_cubic_features(df, primary_features):
    """Add a cubed column ``'<f>,<f>,<f>'`` to ``df`` (in place) for each feature.

    Returns the list of created column names, in input order.
    """
    named = [(feature, '{},{},{}'.format(feature, feature, feature)) for feature in primary_features]
    for feature, name in named:
        df[name] = df[feature]**3
    return [name for _, name in named]



In [None]:
# Build the working frame and the list of primary feature names.

# Work on a private copy so feature engineering never touches original_df.
df = original_df.copy()
# Everything except the trailing two columns is a primary feature; the last
# column is fish_id and the one before it is presumably ground_truth.
primary_features = list(df.columns[:-2])
# square_features = add_square_features(df, primary_features)
# cubic_features = add_cubic_features(df, primary_features)


<h1> Forward Stepwise Selection using AIC score as main criterion for evaluating model </h1>

In [None]:
# Greedy forward stepwise selection: at each step add the single feature whose
# inclusion yields the OLS model (with intercept) with the lowest AIC.
# features = primary_features + square_features + cubic_features
features = primary_features
target = 'ground_truth'
MAX_SELECTED_FEATURES = 10  # number of ranked features to report

feature_subset = []
y_train = df[target]  # loop-invariant, so computed once
for rank in range(1, MAX_SELECTED_FEATURES + 1):
    aic_dict = {}
    for feature in features:
        if feature in feature_subset:
            continue
        X_train = df[feature_subset + [feature]]
        model = sm.OLS(y_train, sm.add_constant(X_train)).fit()
        aic_dict[feature] = model.aic
    if not aic_dict:
        # Every candidate has already been selected (fewer candidates than
        # MAX_SELECTED_FEATURES); stop instead of calling min() on an empty dict.
        break
    best_feature = min(aic_dict, key=aic_dict.get)
    print('Feature ranked #{}: {}'.format(rank, best_feature))
    feature_subset.append(best_feature)
        

<h3> We will probably restrict the model to the features '46', '26', and '24' (and variants of those features), so that only three body parts need to be detected </h3>

<h1> Run cross validation </h1>

In [None]:
def get_train_test_split(df, features, target='ground_truth', test_fraction=0.2, random_seed=None):
    """Split ``df`` into train/test sets without splitting any fish across them.

    Fish ids (``df.fish_id``) are shuffled with numpy's global RNG, and whole
    fish are assigned to the training set until it holds at least
    ``(1 - test_fraction)`` of all rows; the remaining fish form the test set.

    Parameters
    ----------
    df : DataFrame containing ``fish_id``, ``target`` and all ``features``.
    features : list of feature column names.
    target : name of the target column.
    test_fraction : approximate fraction of rows held out for testing.
    random_seed : if not None, seeds numpy's global RNG before shuffling.

    Returns
    -------
    X_train, Y_train, X_test, Y_test (rows re-indexed 0..n-1 in shuffled order).
    """
    if random_seed is not None:
        np.random.seed(random_seed)

    fish_order = df.fish_id.unique()
    np.random.shuffle(fish_order)

    # One sub-frame per fish, in shuffled order, so each fish's rows are contiguous.
    per_fish = [df[df.fish_id == fish_id].copy(deep=True) for fish_id in fish_order]

    # Take whole fish until the training quota is reached.
    train_quota = (1 - test_fraction) * df.shape[0]
    train_sample_size = 0
    for fish_rows in per_fish:
        if not train_sample_size < train_quota:
            break
        train_sample_size += fish_rows.shape[0]

    stacked = pd.concat(per_fish, axis=0)
    stacked.index = range(stacked.shape[0])
    stacked = stacked[features + [target]].copy(deep=True)

    train_df = stacked.iloc[:train_sample_size].copy(deep=True)
    test_df = stacked.iloc[train_sample_size:].copy(deep=True)

    return train_df[features], train_df[target], test_df[features], test_df[target]
    

In [None]:
def perform_cross_validation(df, features, N=10, test_fraction=0.2):
    """Repeated fish-level hold-out validation of an OLS model with intercept.

    For each of ``N`` random splits (see ``get_train_test_split``) an OLS model
    is fitted on the training rows and evaluated on the held-out rows.

    Returns
    -------
    mse_list : per-split mean of squared *relative* errors ((pred - y) / y).
    mae_list : per-split mean of absolute *relative* errors.
    error_means : per-split mean signed error divided by the mean true value.
    """
    mae_list = []
    mse_list = []
    error_means = []
    for _ in range(N):
        X_train, Y_train, X_test, Y_test = get_train_test_split(df, features, test_fraction=test_fraction)
        # has_constant='add' always adds the intercept column. The default
        # ('skip') silently omits it when a fold happens to contain a constant
        # column, which would make train and test design matrices disagree.
        model = sm.OLS(Y_train, sm.add_constant(X_train, has_constant='add')).fit()

        predictions = model.predict(sm.add_constant(X_test, has_constant='add'))
        # Errors are relative to the true value, so the metrics are fractions.
        errors = (predictions - Y_test)/Y_test

        mae_list.append(np.abs(errors).mean())
        mse_list.append((errors ** 2).mean())
        error_means.append(np.mean(predictions - Y_test)/np.mean(Y_test))
    return mse_list, mae_list, error_means
        

In [None]:
# The model uses squared and cubed variants of three primary features. Those
# columns are not present in `df` (their creation is commented out in the
# feature-list cell), so build them on a copy to avoid a KeyError while leaving
# `df`'s column layout untouched for later cells.
base_features = ['24', '46', '26']
cv_df = df.copy(deep=True)
add_square_features(cv_df, base_features)
add_cubic_features(cv_df, base_features)

features = ['24', '24,24', '24,24,24', '46', '46,46', '46,46,46', '26', '26,26', '26,26,26']
mse_list, mae_list, error_means = perform_cross_validation(cv_df, features)
print('Average percentage deviation from true mean: {}'.format(np.mean(np.abs(error_means))))

<h1> Train final model and save it </h1>

In [None]:
# Fit the final OLS model on all rows and persist it.
# `features` includes squared/cubed columns ('a,a', 'a,a,a') that `df` itself
# does not carry, so derive them on a copy from the comma-free base names.
final_df = df.copy(deep=True)
final_base_features = [feature for feature in features if ',' not in feature]
add_square_features(final_df, final_base_features)
add_cubic_features(final_df, final_base_features)

X_train = final_df[features]
y_train = final_df[target]
model = sm.OLS(y_train, sm.add_constant(X_train)).fit()
model.save('/root/alok/data/models/filtered_feature_model.pkl')


<h1> Bryton's experiments below </h1>

In [None]:
# Convert the target to a plain ndarray, squeeze the design matrix, and refit
# an OLS model with intercept (this overwrites the `model` saved above).
# NOTE(review): X_train is whatever an earlier cell left behind; if it is still
# the multi-column frame, np.squeeze presumably leaves it unchanged -- confirm
# whether a single regressor was intended here.
y_train = np.array(y_train)
X_train = np.squeeze(X_train)


model = sm.OLS(y_train, sm.add_constant(X_train)).fit()
print(model.summary())

In [None]:
# Correlation matrix between the target and the regressor(s).
# NOTE(review): presumably written for a 1-D X_train -- confirm the shapes match.
np.corrcoef(y_train, X_train)

In [None]:
# Scatter of regressor against target.
# NOTE(review): plt.scatter needs x and y of equal size, so this presumably assumes a 1-D X_train.
plt.scatter(X_train, y_train)
plt.show()

In [None]:
# Simple linear regression of y_train on X_train. Uses the `stats` alias from
# the top-level imports; the bare name `scipy` is only imported in a later
# cell, so referencing it here fails on a fresh kernel.
slope, intercept, r_value, p_value, std_err = stats.linregress(X_train, y_train)

In [None]:
import scipy

In [None]:
# Build two scaled versions of the feature columns (second column through the
# third-from-last one):
#   new_df  -- each feature divided by its inter-quartile range
#   norm_df -- each feature centred on its median and divided by its std
print(df.shape)
#print(df.isna().sum())

new_df = df.drop(columns=[])  # no-op drop: yields a fresh copy of df
#new_df = df.drop(columns=['14', '24', '34', '45', '46', '47']) 
#new_df = new_df.drop(columns=['16', '26', '36', '56', '67']) # '46', 

print(new_df.isna().sum())

# Keep only rows where every feature column is present.
new_df = new_df.dropna(subset=new_df.columns[1:-2])

print(new_df.shape)

# (The former `my_mean = new_df.mean()` was never read -- the loop below
# rebinds my_mean before first use -- so it has been removed.)
#my_sd = np.std(new_df, axis=0)

means = []  # per-feature centre; note this is the MEDIAN, not the mean
stds = []   # per-feature standard deviation

my_columns = new_df.columns[1:-2]

print(my_columns)

norm_df = new_df.copy()

for x in my_columns:
    my_mean = new_df[x].median()
    my_std = new_df[x].std()
    my_iqr = stats.iqr(new_df[x])

    means.append(my_mean)
    stds.append(my_std)

    # Snapshot the unscaled column so both scalings start from the same values.
    my_row = new_df[x].copy()
    new_df[x] = my_row / my_iqr
    norm_df[x] = (my_row - my_mean) / my_std

#norm_df

In [None]:
# Shape of df with the first column and the last two columns excluded.
df.iloc[:, 1:-2].shape

In [None]:
# Feature columns (second column through third-from-last) of both scalings.
first_feature, last_feature = new_df.columns[1], new_df.columns[-3]
my_df = new_df.loc[:, first_feature:last_feature]
my_norm_df = norm_df.loc[:, first_feature:last_feature]

# Keep a row only if none of its normalised features exceeds 1.5 in magnitude.
has_outlier = (np.abs(my_norm_df) > 1.5).any(axis=1)
array_subset = ~has_outlier

print('Keeping %i of %i' % (np.sum(array_subset), my_norm_df.shape[0]))

my_df = my_df.loc[array_subset, :]

#Y = new_df['ground_truth']
Y = new_df.loc[array_subset, 'ground_truth']
print(my_df.shape)

# All ordered pairwise products of the IQR-scaled features: column (a, b)
# holds my_df[a] * my_df[b], giving n_features**2 columns in total.
n_features = my_df.shape[1]
pidx = np.indices((n_features, n_features)).reshape(2, -1)
lcol = pd.MultiIndex.from_product(
    [my_df.columns, my_df.columns],
    names=[my_df.columns.name, my_df.columns.name],
)
scaled_values = my_df.values
my_df_X = pd.DataFrame(scaled_values[:, pidx[0]] * scaled_values[:, pidx[1]], columns=lcol)
print(my_df_X.shape)

# my_df_X = sm.add_constant(my_df_X)
print(my_df_X.shape)

In [None]:
# Sanity check: index-pair array should be (2, n_features**2) given the reshape(2, -1) above.
pidx.shape

In [None]:
# Sanity check: number of (feature, feature) column labels.
lcol.shape

In [None]:
# Sanity check: rows kept after filtering x number of pairwise-product columns.
my_df_X.shape

In [None]:
# Fit a 2-component PCA on the pairwise-product features and project them.
pca = PCA(n_components=2)

pca.fit(my_df_X)
newX = pca.transform(my_df_X)
# new_X_compare = pca.fit_transform(my_df_X)

In [None]:
# Display the PCA-projected data.
newX

In [None]:
# First projected component against the second.
plt.scatter(newX[:, 0], newX[:, 1])
plt.show()

In [None]:
# Despite the name, these are explained-variance *ratios* (not raw eigenvalues);
# their sum is the fraction of variance captured by the kept components.
eigenvalues = pca.explained_variance_ratio_
print(np.sum(eigenvalues))

In [None]:
# Re-project using the raw component matrix. This overwrites newX with a plain
# dot product (no explicit centring step here, unlike the pca.transform call in
# the earlier cell), matching how the saved components are applied at test time.
components = pca.components_

newX = np.dot(my_df_X, components.T)

print(components.shape)

newX

#outlierIndices = np.where(newX[:,0] > 10)
#my_indices = my_df.index[outlierIndices[0]]


#my_df.loc[my_indices,:]

In [None]:
# Sanity check: (n_rows, n_components).
newX.shape

In [None]:
# First projected component against the ground-truth target.
plt.scatter(newX[:,0], Y)#plt.scatter(np.log(newX[:,0]),np.log(Y))
plt.show()

In [None]:
# Fit OLS (no intercept: add_constant is commented out) of the target on the
# projected components, then inspect in-sample fit quality.
#myX = newX[:, 0]
myX = newX
myY = Y

plt.scatter(newX[:, 0], Y)
plt.show()

# myX = sm.add_constant(myX)

print(myX.shape)
print(myY.shape)

model = sm.OLS(myY, myX).fit()
predictions = model.predict(myX) # make the predictions by the model

print(model.params.shape)
#print(model.summary())

predY = predictions
#predY = np.exp(predictions)

#model.summary()

# In-sample prediction error (predicted minus true).
error = predY - myY

# True vs. predicted scatter and a Q-Q plot of the residuals; both figures are
# rendered by the single plt.show() below.
plt.scatter(Y, predY)

res = model.resid
fig = sm.qqplot(res, fit=True, line='45')

plt.show()

# Median absolute error, in target units and as a percentage of the true value.
print('Error: %0.2f' % (np.median(np.abs(error)), ))
print('Pct Error: %0.2f' % (np.median(np.abs(error) / myY * 100), ))


In [None]:
# Persist the fitted OLS results; the test section reloads this path with sm.load.
model.save('/root/data/models/biomass/model.pickle')

In [None]:
# Persist the PCA component matrix so the same projection can be applied at test time.
np.save('/root/data/models/biomass/components.npy', components)

In [None]:
# Sanity check: (n_components, n_product_features).
components.shape

In [None]:
from collections import Counter

In [None]:
# Number of samples per distinct ground-truth value, most frequent first.
Counter(list(Y)).most_common()

In [None]:
def cross_validation_per_fish(dataframe, Y, test_size = 0.2):
    """Split rows into train/test so that no target value is shared between them.

    Rows are grouped by their value in ``Y`` (each distinct value is treated as
    one fish). Groups are shuffled with numpy's global RNG and assigned to the
    training set for as long as the running row total (including the current
    group) stays below ``(1 - test_size)`` of all rows; every other group goes
    to the test set.

    Parameters
    ----------
    dataframe : DataFrame of regressors, positionally aligned with ``Y``.
    Y : pandas Series of targets (must support ``.iloc``).
    test_size : approximate fraction of rows to hold out.

    Returns
    -------
    X_train, X_test, Y_train, Y_test
    """
    labels = list(Y)
    fish_counts = Counter(labels).most_common()
    np.random.shuffle(fish_counts)

    train_budget = len(labels) * (1 - test_size)
    # A set gives O(1) membership checks in the per-row loop below.
    train_fish_ids = set()
    running_total = 0
    for fish_id, count in fish_counts:
        running_total += count
        if running_total < train_budget:
            train_fish_ids.add(fish_id)

    # Positional row indices for each side of the split.
    train_index = []
    test_index = []
    for position, label in enumerate(labels):
        if label in train_fish_ids:
            train_index.append(position)
        else:
            test_index.append(position)

    X_train = dataframe.iloc[train_index, :]
    Y_train = Y.iloc[train_index]
    X_test = dataframe.iloc[test_index, :]
    Y_test = Y.iloc[test_index]
    return X_train, X_test, Y_train, Y_test

In [None]:
# One fish-level split of the pairwise-product features (default 20% test).
X_train, X_test, Y_train, Y_test = cross_validation_per_fish(my_df_X, Y)

In [None]:
def getError(n_components, k):
    """Estimate out-of-sample error over ``k`` random fish-level splits.

    Reads the notebook globals ``my_df_X`` and ``Y``. For each split an
    intercept-free OLS model is fitted on the training rows and scored on the
    held-out rows.

    NOTE: a PCA is fitted each round but its components are currently NOT used;
    the projection is the first ``n_components`` rows of an identity matrix,
    i.e. the first ``n_components`` raw columns (see the commented-out line).

    Returns a 4-tuple:
        mean over splits of the median absolute error,
        mean over splits of |mean error| / mean(y_test) * 100,
        list of per-split signed mean error / mean(y_test) * 100,
        mean over splits of the median absolute percentage error.
    """
    median_abs_errors = []
    abs_bias_pcts = []
    signed_bias_pcts = []
    median_pct_errors = []

    for _ in range(k):
#         X_train, X_test, y_train, y_test = cross_validation.train_test_split(my_df_X, Y, test_size=0.4)
        X_train, X_test, y_train, y_test = cross_validation_per_fish(my_df_X, Y)

        pca = PCA(n_components=n_components)
        pca.fit(X_train)

#         components = pca.components_ ### put this back in!
        components = np.eye(X_train.shape[1])[:n_components,:]

        # Apply the same projection to both sides of the split.
        train_design = np.dot(X_train, components.T)
        test_design = np.dot(X_test, components.T)

        model = sm.OLS(y_train, train_design).fit()
        error = model.predict(test_design) - y_test

        median_abs_errors.append(np.median(np.abs(error)))
        abs_bias_pcts.append(np.abs(np.mean(error)) / np.mean(y_test) * 100)
        signed_bias_pcts.append(np.mean(error) / np.mean(y_test) * 100)
        median_pct_errors.append(np.median(np.abs(error) / y_test * 100))

    return (
        np.mean(median_abs_errors),
        np.mean(abs_bias_pcts),
        signed_bias_pcts,
        np.mean(median_pct_errors),
    )

In [None]:
# def getError(n_components, k):
#     errors = []
#     avg_errors = []
#     avg_errors_raw = []
#     error_pcts = []
    
#     for i in range(k):
#         pca = PCA(n_components=n_components)

#         pca.fit(my_df_X)

#         components = pca.components_

#         newX = np.dot(my_df_X, components.T)

#         X_train, X_test, y_train, y_test = cross_validation.train_test_split(newX, Y, test_size=0)

#         myX = X_train
#         myY = y_train
#         model = sm.OLS(myY, myX).fit()
        
#         predY = model.predict(X_test) # make the predictions by the model

#         error = predY - y_test

#         errors.append(np.median(np.abs(error)))
#         avg_errors.append(np.abs(np.mean(error)) / np.mean(y_test) * 100)
#         avg_errors_raw.append(np.mean(error) / np.mean(y_test) * 100)
#         error_pcts.append(np.median(np.abs(error) / y_test * 100))
    
#     return (np.mean(errors), np.mean(avg_errors), avg_errors_raw, np.mean(error_pcts))

In [None]:
# Sweep the number of regressors and report cross-validated error for each
# setting (50 random fish-level splits per setting).
#n_eigens = [5, 10, 15, 20, 25, 30, 35, 40, 50, 60, 70, 80, 90, 100]
n_eigens = [1, 2, 3, 5, 10, 15]
# n_eigens = [1]

errors = []            # getError(...)[3]: mean of per-split median pct errors
avg_errors_raws = []   # getError(...)[2]: list of signed bias pcts per split

for n_eigen in n_eigens:
    myError = getError(n_eigen, 50)
    errors.append(myError[3])
    avg_errors_raws.append(myError[2])
    
    #print(myError)

    # myError[1] is the mean absolute bias, in percent of the mean true value.
    print('Achieve %0.2f with %i eigenvectors' % (myError[1], n_eigen))
    
#print(avg_errors_raws[1])
# Index 3 selects the fourth entry of n_eigens (5 with the list above).
plt.plot(avg_errors_raws[3])
plt.show()

# NOTE(review): 0.4 matches the test_size in the commented-out train_test_split
# call inside getError, not the 0.2 default actually used -- confirm intent.
print(.4 * newX.shape[0])

# plt.plot(n_eigens, errors)
# plt.xlabel('Number of regressors')
# plt.ylabel('Median error %')
# plt.show()


In [None]:
'''
Extensions to this
- Try different filtering based off of total norm of covariance of eigenvectors
- More data
'''

In [None]:
# NOTE(review): scratch cell -- relies on newX and y_train left over from earlier
# cells; presumably expects newX to be a single column. Verify before trusting.
np.corrcoef(np.squeeze(newX), np.array(y_train))

In [None]:
# NOTE(review): getError as defined in this notebook returns a 4-tuple, so
# unpacking into two names raises ValueError. This cell looks like a leftover
# from an earlier getError that returned (predY, y_test) -- confirm and remove.
predY, y_test = getError(1, 50)

In [None]:
# Intercept-free OLS with newX as the response and y_train as the regressor
# (the reverse of the direction used for prediction). Scratch cell.
sm.OLS(np.squeeze(newX), np.array(y_train)).fit().summary()

In [None]:
# Display the current predictions (scratch inspection).
predY

In [None]:
# NOTE(review): within this notebook newX_test is only assigned inside getError,
# so at notebook scope this presumably relies on leftover kernel state.
newX_test

In [None]:
from scipy import stats

In [None]:
# Simple linear regression of y_train on newX (scratch; presumably expects a single-column newX).
stats.linregress(np.squeeze(newX), np.array(y_train))

In [None]:
# Mean absolute error of the fixed-slope fit y = 8 * x.
# NOTE(review): the slope 8 is a magic number, presumably read off an earlier fit.
np.abs((np.array(y_train) - 8 * np.squeeze(newX))).mean()

In [None]:
# Projected feature against target (scratch; requires newX and y_train of equal size).
plt.scatter(newX, y_train)
plt.show()

In [None]:
# Intercept-free OLS of y_train on newX (scratch).
sm.OLS(np.array(y_train), np.squeeze(newX)).fit().summary()

In [None]:
from scipy.stats import pearsonr

In [None]:
# Pearson correlation (and its p-value) between target and projected feature.
pearsonr(np.array(y_train), np.squeeze(newX), )

In [None]:
import random
from matplotlib import pyplot as plt
%matplotlib inline

In [None]:

# Histogram counts/bin edges of 10,000 draws from a normal distribution with
# mean 3000 and std 500. Unseeded, so the output differs between runs.
np.histogram(np.random.normal(loc=3000, scale=500, size=10000))

<h1> Test PCA Model </h1>

In [None]:
import pickle


In [None]:
# Read in model, PCA components, and new dataframe
# NOTE: sm.load and pickle.load unpickle arbitrary objects -- only use on trusted files.
# NOTE(review): no visible cell in this notebook writes iqrs.pkl; presumably it
# holds the per-feature IQRs from the scaling step -- confirm its provenance.

model = sm.load('/root/data/models/biomass/model.pickle')
components = np.load('/root/data/models/biomass/components.npy')
# Context manager guarantees the file handle is closed after loading.
with open('/root/data/models/biomass/iqrs.pkl', 'rb') as iqr_file:
    iqrs = pickle.load(iqr_file)
df = pd.read_csv('/root/data/small_pen_data_collection/dataset_13k_pairs.csv')

In [None]:
# Rebuild the frame with body-part ids shifted down by one (2->1 ... 8->7) and
# every pair feature divided by its stored IQR.
new_df = pd.DataFrame()
new_df['ground_truth'] = df['ground_truth']
feature_remapping = {
    '2': '1',
    '3': '2',
    '4': '3',
    '5': '4',
    '6': '5',
    '7': '6',
    '8': '7'
}

for column in df.columns.tolist():
    # Pair features are named by exactly two part ids. Skipping other lengths
    # avoids an IndexError on one-character names and stops longer names that
    # merely start with two digits from being mistaken for a pair.
    if len(column) != 2:
        continue
    part_1, part_2 = column
    if part_1 not in feature_remapping or part_2 not in feature_remapping:
        continue
    remapped_part_1 = feature_remapping[part_1]
    remapped_part_2 = feature_remapping[part_2]
    f = '{}{}'.format(remapped_part_1, remapped_part_2)
    new_df[f] = df[column] / iqrs[f]
# Drop rows with any missing value; dropna returns a new frame.
df = new_df.dropna()

In [None]:
# All columns after the first (ground_truth) are features.
feature_columns = df.columns.tolist()[1:]
df_X = df[feature_columns]

# Ordered pairwise products of the features, labelled by (feature, feature).
n_features = df_X.shape[1]
pidx = np.indices((n_features, n_features)).reshape(2, -1)
lcol = pd.MultiIndex.from_product(
    [df_X.columns, df_X.columns],
    names=[df_X.columns.name, df_X.columns.name],
)
feature_values = df_X.values
X = pd.DataFrame(feature_values[:, pidx[0]] * feature_values[:, pidx[1]], columns=lcol)

# Project with the stored components and predict with the stored model.
newX = np.dot(X, components.T)
predY = model.predict(newX)
y = df.ground_truth.values

In [None]:
# Mean signed error (true minus predicted) relative to the mean PREDICTION.
# NOTE(review): perform_cross_validation uses (predicted - true) / mean(true),
# so sign and denominator differ from this metric -- confirm which is intended.
np.mean(y - predY) / np.mean(predY)

<h1> Test Feature Filtered Model </h1>

In [None]:
# Read in the feature-filtered model and the raw dataframe (no PCA components
# are loaded in this section).
# NOTE: sm.load unpickles the file -- only use on trusted files.

model = sm.load('/root/alok/data/models/filtered_feature_model.pkl')
df = pd.read_csv('/root/data/small_pen_data_collection/dataset_13k_pairs.csv')

In [None]:
# Rebuild the frame with body-part ids shifted down by one (2->1 ... 8->7);
# feature values are copied unscaled.
new_df = pd.DataFrame()
new_df['ground_truth'] = df['ground_truth']
feature_remapping = {
    '2': '1',
    '3': '2',
    '4': '3',
    '5': '4',
    '6': '5',
    '7': '6',
    '8': '7'
}

for column in df.columns.tolist():
    # Pair features are named by exactly two part ids. Skipping other lengths
    # avoids an IndexError on one-character names and stops longer names that
    # merely start with two digits from being mistaken for a pair.
    if len(column) != 2:
        continue
    part_1, part_2 = column
    if part_1 not in feature_remapping or part_2 not in feature_remapping:
        continue
    remapped_part_1 = feature_remapping[part_1]
    remapped_part_2 = feature_remapping[part_2]
    f = '{}{}'.format(remapped_part_1, remapped_part_2)
    new_df[f] = df[column]
# Drop rows with any missing value; dropna returns a new frame.
df = new_df.dropna()

In [None]:
# Design matrix for the filtered model: for each primary feature, the raw,
# squared and cubed columns, in that order.
df_X = pd.DataFrame()
for feature in ['24', '46', '26']:
    raw_values = df[feature]
    square_name = '{},{}'.format(feature, feature)
    cubic_name = '{},{},{}'.format(feature, feature, feature)
    df_X[feature] = raw_values
    df_X[square_name] = raw_values**2 # add square features
    df_X[cubic_name] = raw_values**3 # add cubic features

y = df.ground_truth



In [None]:
# Predict with the loaded model, adding an intercept column to the design matrix.
predY = model.predict(sm.add_constant(df_X))

In [None]:
# Inspect the fitted coefficients of the loaded model.
model.params