# Classification using linear models

In [23]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd

from sklearn import linear_model
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.model_selection import train_test_split

# Annual power consumption data

## Case description / business understanding 

Electric utility companies do not know much about their customers. The information that is available for all customers is their consumption and address. In addition, some utility companies use web-portals to engage a part of their users and collect some additional information.


Some example questions the utility companies want to answer with the help of this dat* a:
Who are the customers using the efficiency port* al?
Can we learn from the information provided on the portal? Can we predict this information for other custom* ers?
What are the atypical cust

We use a dataset that consists of two combined parts1. Yyearly electricity consumptio (address and consumption in kW)
2. D data collected via an efficiency web port l(detailed household information and activity on the porta

omers?


In [56]:
apc = pd.read_csv('data/APC-dataset-anonym.csv', sep=';')
apc.head(5)
#apc.shape

Unnamed: 0,ID,PLZ,Strasse,Betreff,Cons_2011,Days_2011,Cons_2012,Days_2012,Cons_2013,Days_2013,...,Portal,pPoints,pEarnedPoints,pHouseholdType,pMainHeatingType,pWaterHeatingType,pLivingAreaM2,pHouseholdMembers,pCreated,pLastVisited
0,1,3604.0,Adlerstrasse,,3460.0,364.0,3116.0,365.0,3398.0,365.0,...,0.0,,,,,,0.0,0.0,,
1,2,8200.0,Amselweg,Whg 38 Strom,3028.0,364.0,4539.0,456.0,0.0,0.0,...,0.0,,,,,,0.0,0.0,,
2,3,8200.0,Amselweg,Whg 39 Strom,1777.0,364.0,2478.0,456.0,0.0,0.0,...,0.0,,,,,,0.0,0.0,,
3,4,8200.0,Amselweg,Whg 37 Strom,2173.0,364.0,2879.0,456.0,0.0,0.0,...,0.0,,,,,,0.0,0.0,,
4,5,8200.0,Amselweg,Whg 34 Strom,0.0,0.0,1130.0,456.0,0.0,0.0,...,0.0,,,,,,0.0,0.0,,


|Variable          | Description|
|------------------|------------|
|ID	Unique         |customer ID |
|PLZ               |address information: postal code|
|City              |address information: city|
|Strasse           |address information: street|
|Betreff           |miscellaneous information about the meter and the housing|
|Cons_2011;Cons_2012;Cons_2013 |consumption in kWh per year|
|Days_2011;Days_2012;Days_2013 | days in one year in which the consumption was created|
|FilterNonHousehold |a filter created by the utility; the company is not sure if it covers all non-households|
|Portal            |indicates whether the customer uses the energy efficiency portal|
|pPoints           |points on the portal||
pEarnedPoint     |	earned points on the porta|l|
pHouseholdTy    |e	type of housi|n|g
pMainHeatingT  |pe	the main heating type of the househ|o|ld
pWaterHeating |ype	the type of water hea|t|ing
pLivingA     |eaM2	the living area of the hous|e|hold
pHouseholdM |mbers	the number of people living in the hou|s|ehold
p          |reated	timestamp of account c|r|eation
pLas      |Visited	timestamp of the la|st visit



In [11]:
apc.describe(include='all')

Unnamed: 0,ID,PLZ,Strasse,Betreff,Cons_2011,Days_2011,Cons_2012,Days_2012,Cons_2013,Days_2013,...,pPoints,pEarnedPoints,pHouseholdType,pMainHeatingType,pWaterHeatingType,pLivingAreaM2,pHouseholdMembers,pCreated,pLastVisited,single
count,1299.0,1236.0,1236,1221,1163.0,1234.0,1149.0,1235.0,1118.0,1233.0,...,15.0,15.0,14,14,14,1236.0,1236.0,15,15,1299
unique,,,41,228,,,,,,,...,,,2,5,6,,,14,14,2
top,,,Reuthstraße,Einfamilienhaus,,,,,,,...,,,appartment,gas,electric,,,03.09.12 11:06,03.09.12 13:52,False
freq,,,364,131,,,,,,,...,,,7,7,4,,,2,2,1297
mean,650.0,8276.605987,,,1584.337059,195.187196,2672.342907,316.844534,1721.134168,200.022709,...,295.733333,295.733333,,,,1.660194,0.029935,,,
std,375.13331,494.921571,,,1950.346635,111.361331,2802.995836,147.964263,2432.125878,179.823121,...,261.357573,261.357573,,,,17.749204,0.311575,,,
min,1.0,3600.0,,,-1356.0,-106.0,-563.0,-178.0,-421.0,-90.0,...,0.0,0.0,,,,0.0,0.0,,,
25%,325.5,8200.0,,,548.0,179.0,1083.0,356.0,0.0,0.0,...,113.5,113.5,,,,0.0,0.0,,,
50%,650.0,8400.0,,,1054.0,182.0,1924.0,361.0,1074.5,363.0,...,229.0,229.0,,,,0.0,0.0,,,
75%,974.5,8400.0,,,1960.0,196.0,3324.0,367.0,2499.5,367.0,...,461.0,461.0,,,,0.0,0.0,,,


In [10]:
apc.isnull().sum()

ID                       0
PLZ                     63
Strasse                 63
Betreff                 78
Cons_2011              136
Days_2011               65
Cons_2012              150
Days_2012               64
Cons_2013              181
Days_2013               66
FilterNonHousehold      63
Portal                  63
pPoints               1284
pEarnedPoints         1284
pHouseholdType        1285
pMainHeatingType      1285
pWaterHeatingType     1285
pLivingAreaM2           63
pHouseholdMembers       63
pCreated              1284
pLastVisited          1284
single                   0
dtype: int64

In [13]:
apc.dtypes

ID                      int64
PLZ                   float64
Strasse                object
Betreff                object
Cons_2011             float64
Days_2011             float64
Cons_2012             float64
Days_2012             float64
Cons_2013             float64
Days_2013             float64
FilterNonHousehold    float64
Portal                float64
pPoints               float64
pEarnedPoints         float64
pHouseholdType         object
pMainHeatingType       object
pWaterHeatingType      object
pLivingAreaM2         float64
pHouseholdMembers     float64
pCreated               object
pLastVisited           object
single                   bool
CPD_2011              float64
CPD_2012              float64
CPD_2013              float64
dtype: object

## Data preparation - exercises
1. Inspect the output of the previous chunks. What are problematic values in the data, and how could we handle them?
2. Use the function `value_counts()` to see the distribution of the column `Portal`

In [57]:
# Problem 1: Negative values in the consumption / days
# Problem 2: Maximum of days and consumption is quite large
# solution: we normalize the consumption with the number of days to consumption per day (CPD)
apc['CPD_2011'] = apc['Cons_2011'] / apc['Days_2011']
apc['CPD_2012'] = apc['Cons_2012'] / apc['Days_2012']
apc['CPD_2013'] = apc['Cons_2013'] / apc['Days_2013']

# Problem 3: Missing values in consumption and days is different
# Problem 4: Large number of NA values for points, household type, ...
# Problem 5: a value of 0 makes no sense for living area
# Problem 6: Date created and last modified is the wrong format
# Problem 7: PLZ wrong data format

#create a column with single households
apc['Portal'].value_counts()

Portal
0.0    1221
1.0      15
Name: count, dtype: int64

## First model to estimate the number of household members based on the electricity consumption

In [68]:
# Create linear regression object
regr1 = linear_model.LinearRegression()

data_complete = apc[['CPD_2011', 'CPD_2012', 'CPD_2013', 'Portal']].notnull()

#extract the variables that we need from the dataset
y = data_complete.Portal
X = data_complete[['CPD_2011', 'CPD_2012', 'CPD_2013']]

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=16)

# We fit a linear regression model
regr1.fit(X_train, y_train)

y_pred = regr1.predict(X_test) < 0.5
cnf_matrix = confusion_matrix(y_test, y_pred)
print(cnf_matrix)

#print(regr1.predict(X_test))

print(classification_report(y_test, y_pred))


[[ 17   0]
 [308   0]]
              precision    recall  f1-score   support

       False       0.05      1.00      0.10        17
        True       0.00      0.00      0.00       308

    accuracy                           0.05       325
   macro avg       0.03      0.50      0.05       325
weighted avg       0.00      0.05      0.01       325



  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


In [64]:

# instantiate the model (using the default parameters)
logreg = LogisticRegression(random_state=16)

# fit the model with data
logreg.fit(X_train, y_train)

y_pred = logreg.predict(X_test)
cnf_matrix = confusion_matrix(y_test, y_pred)
cnf_matrix

print(classification_report(y_test, y_pred))


              precision    recall  f1-score   support

       False       0.89      1.00      0.94        17
        True       1.00      0.99      1.00       308

    accuracy                           0.99       325
   macro avg       0.95      1.00      0.97       325
weighted avg       0.99      0.99      0.99       325

