In [6]:
import csv
import datetime as dt
from collections import OrderedDict

import scipy
import numpy as np
import pandas as pd
import matplotlib
import matplotlib.pyplot as plt
from sklearn.cluster import KMeans
from sklearn.preprocessing import MinMaxScaler
from sklearn.ensemble import GradientBoostingClassifier
%matplotlib inline
matplotlib.rcParams['figure.figsize'] = [20.0, 8.0] # Default: [6.0, 4.0]

In [82]:
# Load one exported event log; the first CSV column becomes the index and is parsed as dates.
df = pd.read_csv(
    '../data/csv/oving5/0141126194/Card.csv',
    index_col=0,
    parse_dates=True,
)

In [83]:
# Show the column labels of the loaded frame.
df.columns

       'FailureCount1', 'ErrorCount1', 'Completion1', 'JunitTest2',
       'SuccessCount2', 'FailureCount2', 'ErrorCount2', 'Completion2',
       'Launch2'],
      dtype='object')

In [85]:
# Display the Relative_time column.
# NOTE(review): this column appears to be created by the cell under
# "Creating relative time column" further down; on a fresh kernel this cell
# would have to run after that one -- confirm and reorder if so.
df.Relative_time

2017-02-19 10:14:27   00:00:00
2017-02-19 10:15:43   00:01:16
2017-02-19 10:25:34   00:11:07
2017-02-19 10:25:55   00:11:28
2017-02-19 10:26:45   00:12:18
2017-02-19 10:26:55   00:12:28
2017-02-19 10:28:18   00:13:51
2017-02-19 10:28:19   00:13:52
2017-02-19 10:31:59   00:17:32
2017-02-19 10:32:41   00:18:14
2017-02-19 10:33:21   00:18:54
2017-02-19 10:33:42   00:19:15
2017-02-19 10:34:18   00:19:51
2017-02-19 10:35:07   00:20:40
2017-02-19 10:35:28   00:21:01
2017-02-19 10:37:09   00:22:42
2017-02-19 10:38:32   00:24:05
2017-02-19 10:38:56   00:24:29
2017-02-19 10:39:42   00:25:15
2017-02-19 10:40:15   00:25:48
2017-02-19 10:40:16   00:25:49
2017-02-19 10:40:55   00:26:28
2017-02-19 10:41:45   00:27:18
2017-02-19 10:42:49   00:28:22
2017-02-19 10:43:36   00:29:09
2017-02-19 10:44:17   00:29:50
2017-02-19 10:44:23   00:29:56
2017-02-19 10:45:33   00:31:06
2017-02-19 10:46:27   00:32:00
2017-02-19 10:47:29   00:33:02
                        ...   
2017-02-20 16:20:15   04:41:00
2017-02-

## Creating relative time column

In [84]:
# Gaps between consecutive events longer than this are counted as only this much time.
MAX_IDLE_GAP = pd.Timedelta(seconds=600)

# Time elapsed since the previous event; the first event has no predecessor, so its gap is zero.
# NOTE(review): assumes the index is sorted ascending -- confirm.
event_gaps = df.index.to_series().diff().fillna(pd.Timedelta(0))

# Compare whole Timedeltas rather than `.seconds`: `.seconds` is only the sub-day
# component, so a gap of e.g. 1 day 5 minutes would otherwise escape the cap.
capped_gaps = event_gaps.where(event_gaps <= MAX_IDLE_GAP, MAX_IDLE_GAP)

# Running total of the capped gaps. Assigning the whole column at once (instead of
# `df.Relative_time.iat[i] = ...`) avoids chained assignment, and `.to_numpy()` makes
# the assignment positional so repeated timestamps in the index cannot interfere.
df['Relative_time'] = capped_gaps.cumsum().to_numpy()

In [39]:
# Display the frame to inspect its current contents.
df

Unnamed: 0,SourceEdit1,SizeMeasure1,WarningCount1,SourceEdit2,SizeMeasure2,WarningCount2,JunitTest1,SuccessCount1,FailureCount1,ErrorCount1,...,Completion2,JunitTest3,SuccessCount3,FailureCount3,ErrorCount3,Completion3,Launch1,TotalSizeMeasure,TotalCompletion,Relative_time
2017-02-23 22:56:38,1.0,32.0,,,,,,0.0,,,...,,,0.0,,,,,32.0,,0.0
2017-02-23 22:56:38,,32.0,,1.0,73.0,,,0.0,,,...,,,0.0,,,,,105.0,,
2017-02-23 22:57:24,,32.0,,,73.0,,1.0,2.0,,,...,,,0.0,,,,,105.0,,
2017-02-23 22:59:26,,32.0,,,73.0,,,2.0,,,...,,,0.0,,,,,110.0,,
2017-02-23 22:59:29,,32.0,,1.0,77.0,,,2.0,,,...,,,0.0,,,,,114.0,,
2017-02-23 23:00:53,,32.0,,,77.0,,,2.0,,,...,,,0.0,,,,,118.0,,
2017-02-23 23:02:30,,32.0,,,77.0,,,2.0,,,...,,,0.0,,,,,122.0,,
2017-02-23 23:03:17,,32.0,,,77.0,,,2.0,,,...,,,0.0,,,,,126.0,,
2017-02-23 23:13:12,,32.0,,1.0,80.0,,,2.0,,,...,,,0.0,,,,,129.0,,
2017-02-23 23:13:40,,32.0,,,80.0,,,2.0,,,...,0.666667,,0.0,,,,,129.0,,


## Forward-filling and aggregating data

In [27]:
def _numbered(prefix):
    """Return the sub-frame of df whose column names are `prefix` followed by digits.

    The pattern is anchored, so aggregate columns such as 'TotalSizeMeasure'
    are not matched.
    """
    # Raw string for the regex tail: '\d' in a plain string literal is an invalid escape.
    return df.filter(regex='^' + prefix + r'\d{1,}$')


# Carry the last known value forward in every numbered column of these families.
# Columns are assigned back explicitly, one by one, instead of going through df.update().
for prefix in ('SizeMeasure', 'WarningCount', 'Completion',
               'SuccessCount', 'FailureCount', 'ErrorCount'):
    for column in _numbered(prefix).columns:
        df[column] = df[column].ffill()

# Rows before the first recorded success count are still NaN after the forward fill; use 0 there.
for column in _numbered('SuccessCount').columns:
    df[column] = df[column].fillna(0)

# Row-wise aggregates over the numbered columns.
df['TotalSizeMeasure'] = _numbered('SizeMeasure').sum(axis=1)
# skipna=False: the mean is NaN until every Completion column has a value.
df['TotalCompletion'] = _numbered('Completion').mean(skipna=False, axis=1)

## Importing the grades into a dictionary where the keys are username hashes

In [None]:
# Map username hash -> grade, in file order. Requires `csv` and `OrderedDict`
# to be imported in the notebook's import cell.
# The field names are supplied explicitly, so the first line of the file is read
# as data, not as a header. Grades are kept as the strings found in the file.
with open('../brukernummer-karakter.csv', newline='') as csvfile:
    reader = csv.DictReader(csvfile, fieldnames=['hash', 'grade'])
    grades = OrderedDict((row['hash'], row['grade']) for row in reader)

In [None]:
# Display the hash -> grade mapping built in the previous cell.
grades

## Creating features from the raw data

In [None]:
def get_features(df, hash_id):
    """Build a one-row feature frame from a single event log.

    Args:
        df: Event log with a datetime index and the columns ``SizeMeasure``,
            ``WarningCount``, ``Completion``, ``SuccessCount``, ``FailureCount``,
            ``ErrorCount`` and ``JunitTest``. These state columns (and any
            numbered ``SizeMeasure<n>`` columns) are forward-filled in place.
        hash_id: Row label of the result and key into the module-level
            ``grades`` mapping.

    Returns:
        A single-row ``DataFrame`` indexed by ``hash_id`` with the columns
        ``MaxSizeMeasure``, ``FinalSizeMeasure``, ``JunitRuns``,
        ``SizeMeasure_div_JunitRuns``, ``Avg_SizeMeasure_div_Completion``,
        ``MinutesSpent`` and ``Grade``; or ``None`` when ``df`` has no
        ``SizeMeasure`` column, contains no JUnit runs, or ``hash_id`` has no
        entry in ``grades``.
    """
    if 'SizeMeasure' not in df.columns:
        return None

    # Forward-fill by assigning each column back. A chained
    # `df.Col.ffill(inplace=True)` acts on a temporary Series and does not
    # reliably write through to `df` (never under pandas Copy-on-Write).
    # 'SizeMeasure' itself is included because FinalSizeMeasure below relies on
    # the last row carrying the last known size.
    numbered_size_columns = list(df.filter(regex=r'^SizeMeasure\d{1,}$').columns)
    state_columns = ['SizeMeasure', 'WarningCount', 'Completion',
                     'SuccessCount', 'FailureCount', 'ErrorCount']
    for column in numbered_size_columns + state_columns:
        df[column] = df[column].ffill()

    # count() ignores NaN, so this is the number of rows with a JunitTest value.
    junit_runs = int(df['JunitTest'].count())
    if junit_runs == 0:
        return None
    if hash_id not in grades:
        return None

    size = df['SizeMeasure']
    final_size = size.iloc[-1]  # last known size, thanks to the forward fill above
    # Whole seconds between the first and the last event, expressed in minutes.
    elapsed_seconds = (df.index[-1] - df.index[0]) // pd.Timedelta(seconds=1)

    return pd.DataFrame(
        {
            'MaxSizeMeasure': size.max(),
            'FinalSizeMeasure': final_size,
            'JunitRuns': junit_runs,
            # Plain scalar division; no positional `[-1]` lookup on a label-indexed Series.
            'SizeMeasure_div_JunitRuns': final_size / junit_runs,
            # NOTE(review): the name says "div" but the value is the mean of the
            # product SizeMeasure * Completion; kept as in the original -- confirm intent.
            'Avg_SizeMeasure_div_Completion': (size * df['Completion']).mean(),
            'MinutesSpent': elapsed_seconds / 60,
            'Grade': grades[hash_id],
        },
        index=[hash_id],
    )

## Training classifier

In [None]:
# Positional split of df_partner: the first 132 rows are used for training,
# the remaining rows for validation. 'Grade' is the target, everything else is input.
training_input = df_partner.iloc[:132].drop(columns=['Grade'])
training_output = df_partner.iloc[:132][['Grade']].copy().squeeze()

validation_input = df_partner.iloc[132:].drop(columns=['Grade'])
validation_output = df_partner.iloc[132:][['Grade']].copy().squeeze()

In [None]:
# Learn the min/max of every feature from the training set only.
input_scaler = MinMaxScaler()
training_input_scaled = input_scaler.fit(training_input).transform(training_input)
# Reuse the training-set parameters for the validation set. Re-fitting on the
# validation data would scale the two sets differently.
validation_input_scaled = input_scaler.transform(validation_input)

# Unfitted scaler; not used anywhere in this cell.
output_scaler = MinMaxScaler()

In [None]:
# Fixed random_state so that re-running the notebook produces the same model.
# The classifier is fitted on the unscaled training_input; the *_scaled arrays
# are not used here.
classifier = GradientBoostingClassifier(
    n_estimators=1000,
    learning_rate=0.1,
    max_depth=5,
    verbose=1,
    random_state=0,
)
classifier.fit(training_input, training_output)

### Predicting using the classifier

In [None]:
# Predicted grades for the (unscaled) validation rows.
classifier.predict(validation_input)

In [None]:
# Display the true grades of the validation rows.
validation_output

In [None]:
# Scatter of two engineered features; labelled so the figure can be read on its own.
fig, ax = plt.subplots()
ax.scatter(df_partner.MaxSizeMeasure, df_partner.SizeMeasure_div_JunitRuns)
ax.set(
    title='SizeMeasure_div_JunitRuns vs. MaxSizeMeasure',
    xlabel='MaxSizeMeasure',
    ylabel='SizeMeasure_div_JunitRuns',
);