In [0]:
%%bash
# Fetch the dataset once; later runs reuse the local copy.
set -euo pipefail

DATA_FILE="random-forest-apps.parquet"
DATA_URL="https://github.com/hmatalonga/farmer-showcase/raw/master/data/random-forest-apps.parquet"

if [ ! -f "${DATA_FILE}" ]; then
  # Download under a temporary name and rename only on success, so an
  # interrupted transfer never leaves a truncated file that would satisfy
  # the existence check above on the next run.
  trap 'rm -f "${DATA_FILE}.part"' EXIT
  wget -q -O "${DATA_FILE}.part" "${DATA_URL}"
  mv "${DATA_FILE}.part" "${DATA_FILE}"
fi

In [0]:
import numpy as np
import pandas as pd
import pyarrow.parquet as pq

from sklearn import metrics
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.preprocessing import StandardScaler

In [0]:
# Loads the `google.colab.data_table` IPython extension.
# NOTE(review): presumably this switches DataFrame output to Colab's
# interactive table view and is only available inside Colab — confirm.
%load_ext google.colab.data_table

In [0]:
def load_df(path, columns=None, use_threads=True, strings_to_categorical=True):
  """
  Load a parquet file and return it as a pandas DataFrame.

  Parameters
  ----------
  path
      Location of the parquet file; forwarded to ``pq.read_table``.
  columns
      Forwarded to ``pq.read_table`` as ``columns``.
  use_threads
      Forwarded to ``pq.read_table`` as ``use_threads``.
  strings_to_categorical
      Forwarded to ``Table.to_pandas`` as ``strings_to_categorical``.

  Returns
  -------
  The DataFrame produced by ``table.to_pandas(...)``.

  Raises
  ------
  Any exception raised while reading or converting the file propagates to
  the caller. Previously the error was printed and ``None`` was returned,
  which made the next cell fail with an unrelated ``AttributeError``.
  """
  table = pq.read_table(path, columns=columns, use_threads=use_threads)
  return table.to_pandas(strings_to_categorical=strings_to_categorical)


def save_df(df, path, compression='snappy', use_dictionary=True):
  """
  Save a pandas DataFrame to a parquet file.

  Parameters
  ----------
  df
      Object whose ``to_parquet`` method is called.
  path
      Destination; forwarded to ``df.to_parquet``.
  compression
      Forwarded to ``df.to_parquet`` as ``compression``.
  use_dictionary
      Forwarded to ``df.to_parquet`` as ``use_dictionary``.

  Raises
  ------
  Any exception raised by ``df.to_parquet`` propagates to the caller.
  Previously the error was only printed, so a failed save looked like a
  successful one to the calling code.
  """
  df.to_parquet(path, compression=compression,
                use_dictionary=use_dictionary)


def truncate(n, decimals=0):
    """
    Cut ``n`` off after ``decimals`` decimal places (no rounding).

    The value is multiplied by ``10 ** decimals``, converted with ``int()``
    (which discards the fractional part, i.e. truncates toward zero) and
    divided by the same factor again, so the result is always a float.

    Note: the scaling happens in binary floating point, so a value whose
    scaled form lands just below an integer is cut one step low
    (e.g. ``truncate(0.29, 2)`` gives ``0.28``).
    """
    scale = pow(10, decimals)
    shifted = n * scale
    return int(shifted) / scale

In [0]:
# Read the downloaded parquet file into the working DataFrame.
DATA_PATH = 'random-forest-apps.parquet'
df = load_df(DATA_PATH)

In [6]:
# Print the frame summary: index range, column names, non-null counts, dtypes.
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 397008 entries, 0 to 397007
Data columns (total 20 columns):
name                   397008 non-null category
application_label      397008 non-null category
importance             397008 non-null category
version_code           397008 non-null int64
battery_state          397008 non-null category
battery_level          397008 non-null uint8
network_status         397008 non-null category
screen_brightness      397008 non-null int64
screen_on              397008 non-null uint8
charger                397008 non-null category
current_average        397008 non-null int64
health                 397008 non-null category
temperature            397008 non-null float32
usage                  397008 non-null float32
bluetooth_enabled      397008 non-null uint8
location_enabled       397008 non-null uint8
power_saver_enabled    397008 non-null uint8
flashlight_enabled     397008 non-null uint8
nfc_enabled            397008 non-null uint8
voltage  

In [0]:
# Remove the 'application_label' column from the frame.
df = df.drop(columns=['application_label'])

#### Move voltage to first position

In [0]:
# 'voltage' goes first; the remaining columns follow in the listed order.
target_column = 'voltage'
feature_columns = [
    'name',
    'importance',
    'version_code',
    'battery_state',
    'battery_level',
    'network_status',
    'screen_brightness',
    'screen_on',
    'charger',
    'current_average',
    'health',
    'temperature',
    'usage',
    'bluetooth_enabled',
    'location_enabled',
    'power_saver_enabled',
    'flashlight_enabled',
    'nfc_enabled',
]
columns = [target_column] + feature_columns

df = df[columns]

#### Normalize unit values

In [0]:
def _rescale_large_voltage(reading):
    # Readings above 1000 are divided by 1000; all others pass through.
    # NOTE(review): presumably this converts millivolt readings to volts — confirm.
    if reading > 1000:
        return reading / 1000
    return reading


df['voltage'] = df['voltage'].apply(_rescale_large_voltage)

# Cut each of these columns off at two decimal places.
for column in ('usage', 'voltage', 'temperature'):
    df[column] = df[column].apply(lambda value: truncate(value, 2))

#### Label encoding for app names

In [0]:
# Swap each categorical app name for its integer category code.
# After this the column is no longer categorical, so re-running this cell
# on the same frame fails on the `.cat` accessor.
app_name_codes = df['name'].cat.codes
df['name'] = app_name_codes

#### One-Hot encoding for categorical values

In [0]:
# Columns expanded into one indicator column per category.
categorical_columns = [
    'importance',
    'battery_state',
    'network_status',
    'charger',
    'health',
]
df = pd.get_dummies(df, columns=categorical_columns)

In [0]:
# Target is 'voltage'; every other column is a feature.
# Selecting by name replaces the positional slice `1:40`, which would silently
# drop trailing one-hot columns whenever get_dummies yields more than 40
# columns in total.
X = df.drop(columns=['voltage']).values
y = df['voltage'].values

In [0]:
# Hold out 20% of the rows for evaluation; the fixed seed keeps the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=0,
)

In [0]:
# Fit the scaler on the training rows only, then apply the same
# transformation to both the training and the test rows.
sc = StandardScaler().fit(X_train)
X_train = sc.transform(X_train)
X_test = sc.transform(X_test)

In [0]:
# Train a 200-tree random forest with a fixed seed, then predict on the test rows.
forest_params = {'n_estimators': 200, 'random_state': 0}
regressor = RandomForestRegressor(**forest_params)
regressor.fit(X_train, y_train)

y_pred = regressor.predict(X_test)

In [16]:
# Compute each error metric once, then report them.
mae = metrics.mean_absolute_error(y_test, y_pred)
mse = metrics.mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)

print('Mean Absolute Error:', mae)
print('Mean Squared Error:', mse)
print('Root Mean Squared Error:', rmse)

Mean Absolute Error: 0.0009103798234690831
Mean Squared Error: 5.117955021145149e-05
Root Mean Squared Error: 0.007153988412868132
