# Data Luxembourg

## Importing Necessary Libraries and Reading Data

In [2]:
from mat4py import loadmat
from datetime import datetime, timedelta

import pandas as pd
import numpy as np

import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline
plt.style.use('seaborn-notebook')
pd.set_option('display.max_column', 250)
from matplotlib import rcParams
rcParams['figure.figsize'] = (5, 3)
rcParams['figure.dpi'] = 150

In [3]:
data = loadmat("data/dataLU.mat")
#print(data)
X = pd.DataFrame(data['dataLU']['X']).transpose()
y = pd.DataFrame(data['dataLU']['y'], columns=['label'])
t = pd.DataFrame(data['dataLU']['t'], columns=['date'])
print(X.shape, y.shape, t.shape)

(1901, 31) (1901, 1) (1901, 1)


In [4]:
matlab_datenum = 1257.5833333333333
python_datetime = datetime.fromordinal(int(matlab_datenum)) + timedelta(days=matlab_datenum%1) - timedelta(days = 366)
python_datetime

datetime.datetime(3, 6, 10, 14, 0)

In [5]:
t['date'].apply(lambda matlab_datenum: datetime.fromordinal(int(matlab_datenum)) + timedelta(days=matlab_datenum%1) - timedelta(days = 366))

0       0003-05-09 10:00:00
1       0003-05-10 17:00:00
2       0003-05-11 16:00:00
3       0003-05-11 19:00:00
4       0003-05-12 10:00:00
5       0003-05-12 19:00:00
6       0003-05-13 10:00:00
7       0003-05-13 11:00:00
8       0003-05-13 11:00:00
9       0003-05-13 13:00:00
10      0003-05-13 14:00:00
11      0003-05-13 15:00:00
12      0003-05-13 15:00:00
13      0003-05-13 17:00:00
14      0003-05-13 17:00:00
15      0003-05-13 17:00:00
16      0003-05-13 18:00:00
17      0003-05-13 19:00:00
18      0003-05-13 19:00:00
19      0003-05-14 09:00:00
20      0003-05-14 09:00:00
21      0003-05-14 10:00:00
22      0003-05-14 16:00:00
23      0003-05-14 17:00:00
24      0003-05-14 18:00:00
25      0003-05-14 18:00:00
26      0003-05-14 18:00:00
27      0003-05-14 18:00:00
28      0003-05-14 19:00:00
29      0003-05-15 11:00:00
               ...         
1871    0005-02-01 21:00:00
1872    0005-02-02 15:00:00
1873    0005-02-02 16:00:00
1874    0005-02-02 16:00:00
1875    0005-02-02 1

In [6]:
python_datetime

datetime.datetime(3, 6, 10, 14, 0)

## Info on Dataset

**Name**: Luxembourg Dataset

**Author/Complier:** Indrė Žliobaitė

**Source**: https://sites.google.com/site/zliobaite/resources-1

**Brief Explanation:** Luxembourg dataset is constructing using European Social Survey data. Each instance is an individual. The attributes are formed from answers to the survey questionnaire. The labels indicate high or low internet usage. The dataset has time stamps, the questionnaires are collected over 5 years period. It is expected that internet usage is changing over time (concept drift). 

In [13]:
print("Shape: ", X.shape)
print("No of Class: ", y.nunique())
print("Class balance: \n", y.groupby('label')['label'].count())

Shape:  (1901, 31)
No of Class:  label    2
dtype: int64
Class balance: 
 label
0    977
1    924
Name: label, dtype: int64


## Fit a Classifier

In [14]:
from sklearn.model_selection import train_test_split

In [15]:
train_X, val_X, train_y, val_y = train_test_split(X, y, random_state=0)

In [17]:
from sklearn.svm import LinearSVC

SVM = LinearSVC()
SVM.fit(train_X, train_y)
print("Training Score: {0:3f}".format(SVM.score(train_X, train_y)),
      "Test Score: {0:3f}".format(SVM.score(val_X, val_y)))

Training Score: 1.000000 Test Score: 1.000000


  y = column_or_1d(y, warn=True)


In [21]:
svm_year1 = LinearSVC().fit(X[:380],y.label[:380])
print("Score for Y1: {0:3f}".format(svm_year1.score(X[:380],y.label[:380])),
      "Score for Y2: {0:3f}".format(svm_year1.score(X[380:760],y.label[380:760])),
      "Score for Y3: {0:3f}".format(svm_year1.score(X[760:1140],y.label[760:1140])),
      "Score for Y4: {0:3f}".format(svm_year1.score(X[1140:],y.label[1140:])))

Score for Y1: 1.000000 Score for Y2: 1.000000 Score for Y3: 1.000000 Score for Y4: 1.000000


In [7]:
from sklearn.linear_model import LogisticRegression
logreg = LogisticRegression().fit(X[:100], y.label[:100])



In [8]:
logreg.score(X[:-100], y.label[:-100])

0.9544697390338701