## Assignment 1 - LakeFS

### Installs

In [67]:
! pip install lakefs


[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is available: [0m[31;49m23.3.1[0m[39;49m -> [0m[32;49m25.1.1[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpip install --upgrade pip[0m


### Installs

In [None]:
import pandas as pd
import numpy as np
import lakefs
from lakefs.client import Client

### Cleaning Data Helper Function

In [99]:
def clean_data(raw_data):
    clean_data = raw_data.copy()
    # Remove not relevant columns
    clean_data = clean_data.dropna(subset=['region','age','weight','height','howlong','gender','eat', \
                               'train','background','experience','schedule','howlong', \
                               'deadlift','candj','snatch','backsq','experience',\
                               'background','schedule','howlong'])
    clean_data = clean_data.drop(columns=['affiliate','team','name','athlete_id','fran','helen','grace',\
                              'filthy50','fgonebad','run400','run5k','pullups','train'])

    # Remove Outliers

    clean_data = clean_data[clean_data['weight'] < 1500]
    clean_data = clean_data[clean_data['gender'] != '--']
    clean_data = clean_data[clean_data['age'] >= 18]
    clean_data = clean_data[(clean_data['height'] < 96) & (clean_data['height'] > 48)]

    clean_data = clean_data[(clean_data['deadlift'] > 0) & (clean_data['deadlift'] <= 1105)|((clean_data['gender'] == 'Female') \
                 & (clean_data['deadlift'] <= 636))]
    clean_data = clean_data[(clean_data['candj'] > 0) & (clean_data['candj'] <= 395)]
    clean_data = clean_data[(clean_data['snatch'] > 0) & (clean_data['snatch'] <= 496)]
    clean_data = clean_data[(clean_data['backsq'] > 0) & (clean_data['backsq'] <= 1069)]

    # Clean Survey Data

    decline_dict = {'Decline to answer|': np.nan}
    clean_data = clean_data.replace(decline_dict)
    clean_data = clean_data.dropna(subset=['background','experience','schedule','howlong','eat'])
    return clean_data

### Connecting LakeFS Repository and Branch

In [None]:
clt = Client(username="AKIAIOSFOLQUICKSTART", password="wJalrXUtnFEMI/K7MDENG/bPxRfiCYEXAMPLEKEY", host="http://localhost:8000")
repo = lakefs.Repository(repository_id="assignment1", client=clt)
branch = repo.branch('main')


Branch(repository="assignment1", id="main")


### Writing Raw Data to LakeFS

In [None]:
obj = branch.object(path="data_versions/athletes-v1.csv")

# Open local CSV file
with open("athletes.csv", "rb") as local_file:
    # Open remote writer
    with obj.writer(mode='w', content_type="text/csv") as fd:
        # Write the entire file in binary
        fd.write(local_file.read())


### EDA + Feature Engineering - Raw Data

In [None]:
# Read the correct version from LakeFS to perform EDA
obj = branch.object(path="data_versions/athletes-v1.csv")
read_data = pd.read_csv(obj.reader(mode='r'))
read_data.to_csv('athletes.csv')

#This script re-writes the data to our local file system
!python eda.py

Shape: (423006, 29)
Columns: ['Unnamed: 0', 'athlete_id', 'name', 'region', 'team', 'affiliate', 'gender', 'age', 'height', 'weight', 'fran', 'helen', 'grace', 'filthy50', 'fgonebad', 'run400', 'run5k', 'candj', 'snatch', 'deadlift', 'backsq', 'pullups', 'eat', 'train', 'background', 'experience', 'schedule', 'howlong', 'total_lift']
Data types:
 Unnamed: 0      int64
athlete_id    float64
name           object
region         object
team           object
affiliate      object
gender         object
age           float64
height        float64
weight        float64
fran          float64
helen         float64
grace         float64
filthy50      float64
fgonebad      float64
run400        float64
run5k         float64
candj         float64
snatch        float64
deadlift      float64
backsq        float64
pullups       float64
eat            object
train          object
background     object
experience     object
schedule       object
howlong        object
total_lift    float64
dtype: object

### Writing Feature Engineered Data to LakeFS - Raw Data

In [78]:
obj = branch.object(path="data_versions/athletes-feature-v1.csv")

# Open local CSV file
with open("athletes.csv", "rb") as local_file:
    # Open remote writer
    with obj.writer(mode='w', content_type="text/csv") as fd:
        # Write the entire file in binary
        fd.write(local_file.read())

### Writing Clean Data to LakeFS

In [None]:
orig_data = pd.read_csv('athletes.csv')
clean_data = clean_data(orig_data)
clean_data.to_csv('athletes.csv')

obj = branch.object(path="data_versions/athletes-v2.csv")

# Open local CSV file
with open("athletes.csv", "rb") as local_file:
    # Open remote writer
    with obj.writer(mode='w', content_type="text/csv") as fd:
        # Write the entire file in binary
        fd.write(local_file.read())

### EDA + Feature Engineering - Clean Data

In [101]:
# Read the correct version from LakeFS to perform EDA
obj = branch.object(path="data_versions/athletes-v2.csv")
read_data = pd.read_csv(obj.reader(mode='r'))
read_data.to_csv('athletes.csv')

!python eda.py

Shape: (30029, 16)
Columns: ['Unnamed: 0', 'region', 'gender', 'age', 'height', 'weight', 'candj', 'snatch', 'deadlift', 'backsq', 'eat', 'background', 'experience', 'schedule', 'howlong', 'total_lift']
Data types:
 Unnamed: 0      int64
region         object
gender         object
age           float64
height        float64
weight        float64
candj         float64
snatch        float64
deadlift      float64
backsq        float64
eat            object
background     object
experience     object
schedule       object
howlong        object
total_lift    float64
dtype: object

First 5 rows:
   Unnamed: 0               region  ...     howlong  total_lift
0          21  Southern California  ...  1-2 years|      1110.0
1          22               Africa  ...  2-4 years|       910.0
2          27           North East  ...  2-4 years|      1335.0
3          50        North Central  ...  1-2 years|      1354.0
4          60           North East  ...   4+ years|      1225.0

[5 rows x 16 colum

### Writing Feature Engineered Data to LakeFS - Clean Data

In [83]:
obj = branch.object(path="data_versions/athletes-feature-v2.csv")

# Open local CSV file
with open("athletes.csv", "rb") as local_file:
    # Open remote writer
    with obj.writer(mode='w', content_type="text/csv") as fd:
        # Write the entire file in binary
        fd.write(local_file.read())

### Model Creation - Raw Data

In [106]:
# Read the correct version from LakeFS to perform EDA
obj = branch.object(path="data_versions/athletes-feature-v1.csv")
read_data = pd.read_csv(obj.reader(mode='r'))
read_data.to_csv('athletes.csv')

!python create_model.py

READY TO GO
Filled missing values and encoded categoricals
Train/test split completed
(85191, 5) (85191,)
Model training completed
R^2 score: 0.000
RMSE: 181820.552


### Model Creation - Clean Data

In [None]:
# Read the correct version from LakeFS to perform EDA
obj = branch.object(path="data_versions/athletes-feature-v2.csv")
read_data = pd.read_csv(obj.reader(mode='r'))
read_data.to_csv('athletes.csv')

!python create_model.py

## Github LFS

In [None]:
! git lfs install
! git lfs track "*.csv"
! git add .gitattributes
! git commit -m "Track CSV files with Git LFS"

### Raw Data Versioning/ Tagging

In [None]:
! git add athletes.csv
! git commit -m "Add raw dataset (v1)"
! git tag v1

7d34b8cb53 - assignment1/athletes.csv


### Clean Data Versioning/ Tagging

In [None]:
clean_data = clean_data(orig_data)
clean_data.to_csv('athletes.csv')

! git add athletes.csv
! git commit -m "Add clean dataset (v2)"
! git tag v2

### Pushing All Changes/ Tags

In [None]:
! git push
! git push origin --tags
! git lfs push origin main

### EDA + Feature Engineering - Raw Data

In [None]:
! git checkout v1 -- athletes.csv
! python eda.py
! git add data.csv
! git commit -m "Add total_lift to raw dataset based on v1"
! git tag v1-feature
! git push
! git push origin --tags


### EDA + Feature Engineering - Clean Data

In [None]:
! git checkout v2 -- athletes.csv
! python eda.py
! git add athletes.csv
! git commit -m "Add total_lift to cleaned dataset based on v2"
! git tag -f v2-feature
! git push
! git push origin --tags

### Model Creation - Raw Data

In [114]:
! git checkout v1-feature athletes.csv 
! python create_model.py

Updated 1 path from 12650c0
READY TO GO
Filled missing values and encoded categoricals
Train/test split completed
(85191, 5) (85191,)
Model training completed
R^2 score: 0.000
RMSE: 181821.268


### Model Creation - Clean Data

In [115]:
! git checkout v2-feature athletes.csv 
! python create_model.py

Updated 1 path from 76bfe0c
READY TO GO
Filled missing values and encoded categoricals
Train/test split completed
(30029, 4) (30029,)
Model training completed
R^2 score: 0.621
RMSE: 170.563
