<a href="https://colab.research.google.com/github/Krittipoom/FOR-fake-or-real-dataset-classification/blob/main/code/notebook/Part1%20Transform%20Raw%20Data.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Load Libraries and GitHub Repo

In [1]:
import pandas as pd
import matplotlib.pyplot as plt
import matplotlib.font_manager as font_manager
from PIL import Image
from google.colab import files

In [2]:
# Fetch the project repository into the current runtime.
!git clone https://github.com/Krittipoom/amarin-customer-segment.git
# Move everything under the repository's code/ directory into the working directory
# (presumably this includes the data.tar.xz archive extracted later -- TODO confirm).
!mv ./amarin-customer-segment/code/* ./
# Remove the remainder of the clone; only the files moved above are kept.
!rm -r ./amarin-customer-segment/

Cloning into 'amarin-customer-segment'...
remote: Enumerating objects: 205, done.[K
remote: Counting objects: 100% (140/140), done.[K
remote: Compressing objects: 100% (119/119), done.[K
remote: Total 205 (delta 96), reused 30 (delta 20), pack-reused 65[K
Receiving objects: 100% (205/205), 61.68 MiB | 31.87 MiB/s, done.
Resolving deltas: 100% (103/103), done.


# Transform Raw Data



In [3]:
print('extracting raw data ...')
!tar -xf data.tar.xz
!ls data.csv
print('data.csv <-- Raw Data')

chunksize = 10 ** 6
df = pd.DataFrame()
for chunk in pd.read_csv('data.csv', chunksize=chunksize, skipinitialspace=True):
    df = pd.concat([df, chunk], ignore_index=True)
del df['Unnamed: 3']

df.head()

extracting raw data ...
data.csv
data.csv <-- Raw Data


Unnamed: 0,alias_hw_id,address,timestamp
0,A1,01:40:9f:84:86:6e,2023-06-27 10:01
1,A1,01:74:99:ea:3d:46,2023-06-27 10:01
2,A1,05:2a:17:9b:0f:fe,2023-06-27 10:01
3,A1,05:b3:d7:a5:21:bb,2023-06-27 10:01
4,A1,07:48:ff:e8:b9:d1,2023-06-27 10:01


In [4]:
# Remove fully identical rows and report how many were discarded.
before = df.shape[0]
print('len(df) before drop duplicates:\t', before)

df = df.drop_duplicates()
after = df.shape[0]

print('len(df) after drop duplicates:\t', after)
print('before - after =\t', before - after)

len(df) before drop duplicates:	 11681246
len(df) after drop duplicates:	 3541294
before - after =	 8139952


In [5]:
# Single source of truth: hardware alias -> location name.
# The location strings are data and are kept verbatim.
ALIAS_TO_LOCATION = {
    'A1': 'นายอินทร์',
    'A2': 'ร้านอาหาร',
    'A3': 'relax zone',
    'A4': 'ทางเดินกลางงาน',
    'A5': 'stage',
    'A6': 'kidszoona',
    'A7': 'จุดแลกของสมนาคุณ',
    'A8': 'เครื่องใช้ไฟฟ้า',
    'A9': 'บูธหลัก',
    'A10': 'book yard',
    'B1': 'แข่งคลาน&บริการรถเข็น',
    'B2': 'ประตู101',
    'B3': 'cool kids camp',
    'B5': 'ประตู102',
}

# Display label "(alias) location", derived from the mapping above so the two
# can never drift apart.
ALIAS_WITH_LOCATION = {
    alias: f'({alias}) {location}'
    for alias, location in ALIAS_TO_LOCATION.items()
}

# Aliases missing from the mapping (there is no 'B4' entry, for example) map to
# NaN in both new columns, as they did with the per-alias .loc assignments.
df = df.assign(
    location=df['alias_hw_id'].map(ALIAS_TO_LOCATION),
    alias_with_location=df['alias_hw_id'].map(ALIAS_WITH_LOCATION),
).rename(columns={'alias_hw_id': 'alias'})

# Split 'timestamp' at the first space into date/time, then 'time' at the first
# colon into hours/minutes. All four new columns hold strings.
df[['date', 'time']] = df['timestamp'].str.split(' ', n=1, expand=True)
df[['hours', 'minutes']] = df['time'].str.split(':', n=1, expand=True)

# Keep only the output columns and order rows per address over time.
# NOTE(review): date/hours/minutes are compared as strings, so this ordering is
# chronological only if the timestamp components are zero-padded -- confirm.
df = df[
    ['address', 'date', 'time', 'hours', 'minutes', 'alias_with_location', 'alias', 'location']
].sort_values(by=['address', 'date', 'hours', 'minutes'])

df.head()

Unnamed: 0,address,date,time,hours,minutes,alias_with_location,alias,location
3797823,00:00:00:00:00:00,2023-06-30,03:50,3,50,(A6) kidszoona,A6,kidszoona
6055561,00:00:00:00:00:00,2023-06-30,13:49,13,49,(B1) แข่งคลาน&บริการรถเข็น,B1,แข่งคลาน&บริการรถเข็น
6506639,00:00:00:00:00:00,2023-07-01,03:09,3,9,(B5) ประตู102,B5,ประตู102
6508003,00:00:00:00:00:00,2023-07-01,03:10,3,10,(B5) ประตู102,B5,ประตู102
7623774,00:00:00:00:00:00,2023-07-01,08:44,8,44,(B5) ประตู102,B5,ประตู102


# Save Data

In [6]:
# Write the transformed frame to CSV without the index column, then hand the
# file to the Colab download helper.
output_path = 'transformed_data.csv'
df.to_csv(output_path, sep=',', encoding='utf-8', index=False)
files.download(output_path)

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>