## Data Preparation

In [144]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import re
from dateutil import tz
from datetime import date
import calendar
%matplotlib inline

In [24]:
# Data import and previous treatment
regColumns = ['day','month','year','client','vehicle','plate','value','service']
dayColumns = ['date','day','month','year','value']
monthColumns = ['date','month','year','value']

regData= pd.read_csv('registros.csv', encoding='latin-1', header=0, sep=';', names=regColumns)
dayData = pd.read_csv('soma_diaria.csv', encoding='latin-1', header=0, sep=';', names=dayColumns)
monthData = pd.read_csv('soma_mensal.csv', encoding='latin-1', header=0, sep=';', names=monthColumns)

## RegData preparation

In [25]:
regData.head(10)

Unnamed: 0,day,month,year,client,vehicle,plate,value,service
0,1,4,20,JULIAN SILVEIRA,RANGER,IXU7D08,30,
1,2,4,20,JULIAN SILVEIRA,RANGER,IXU7D08,70,
2,2,4,20,GIOVANE BANCOS,FOCUS,NIN6C91,100,
3,2,4,20,,Fox,ITO7654,40,
4,2,4,20,MONTANHA,PALIO,IVO6973,20,
5,2,4,20,SEM NOME,,ITF9873,40,
6,2,4,20,JUNIOR,UNO,IQM4554,20,
7,2,4,20,ZETI,ECOSPORT,IWH6997,40,
8,2,4,20,MURIEL,OMEGA,JKV7B27,40,
9,2,4,20,SEM NOME,FOX,IUF5691,40,


In [26]:
# Fill blank spaces with nan
regData.service = regData.service.replace(r'^\s+$', np.nan, regex=True)

In [32]:
# Check first non NaN line
for index,row in enumerate(regData.service.isnull()):
    if row != True:
        print(row)
        print(index)
        print(' ')
        break

False
498
 


In [47]:
# Creating new dataset starting from the first use of "service" column
regDataNew = regData.loc[498:]

In [55]:
# Putting 20 before year
regDataNew.year ='20'+regDataNew.year 

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self[name] = value


In [62]:
# Converting to strings to merge columns
regDataNew.day = regDataNew.day.astype(str)
regDataNew.month = regDataNew.month.astype(str)
regDataNew.year = regDataNew.year.astype(str)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self[name] = value


In [64]:
# Merging columns
regDataNew['date']=regDataNew.year+'-'+regDataNew.month+'-'+regDataNew.day

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  regDataNew['date']=regDataNew.year+'-'+regDataNew.month+'-'+regDataNew.day


In [111]:
# Treating typing errors
regDataNew = regDataNew.replace('88','8').replace('2020-8-88','2020-8-8')
regDataNew = regDataNew.replace('200-10-17','2020-10-17')
regDataNew = regDataNew.replace('2020-20-1','2020-12-1')
regDataNew = regDataNew.replace('2010-11-14','2020-11-14')
regDataNew = regDataNew.replace('2010-12-3','2020-12-3')
regDataNew = regDataNew.replace('202-2-19','2021-2-19')

In [117]:
# Finding typing errors
for index, row in regDataNew.iterrows():
    if int(row.day) > 31:
        print('Day error, index: ', index)
    elif int(row.month) > 12:
        print('Month error, index: ', index)
    elif (int(row.year) > 2021) | (int(row.year) < 2020):
        print('Year error, index: ', index)

Year error, index:  1492
Year error, index:  1762
Month error, index:  1908
Year error, index:  1916
Year error, index:  2513


In [138]:
# Converting date to pandas datetime object
regDataNew.date = pd.to_datetime(regDataNew.date)

In [121]:
# Droping old date columns
regDataFinal = regDataNew.drop(['day','month','year'],axis=1)

In [133]:
regDataFinal.head()

Unnamed: 0,client,vehicle,plate,value,service,date
498,GIOVANNE,FOCUS,NTN6C91,40,COMPLETA,2020-07-01
499,ROGERIO,ONIX,RDS8F01,40,COMPLETA,2020-07-01
500,TAXI MESSINHO,ONIX,IZV3H78,12,EXPRESSA,2020-07-01
501,MIGUEL CONSERTCAR,D20,ICR4980,100,COMPLETA,2020-07-02
502,LINDOLFO,CELTA,JBW0120,60,GERAL,2020-07-02


## Day Data Preparation

In [171]:
dayData.head()

Unnamed: 0,date,day,month,year,value
0,24/04/2020 00:00,24.0,4.0,20.0,200
1,25/04/2020 00:00,25.0,4.0,20.0,340
2,25/04/2020 00:00,15.0,4.0,20.0,160
3,26/04/2020 00:00,1.0,4.0,20.0,0
4,26/04/2020 00:00,25.0,4.0,20.0,340


In [173]:
# Check 'NaN' values
dayData.loc[dayData.date.isna() | dayData.day.isna() | dayData.month.isna() | dayData.year.isna() | dayData.value.isna()]  

Unnamed: 0,date,day,month,year,value
36,28/05/2020 00:00,28.0,,20.0,0
62,11/07/2020 00:00,,,,0
143,19/08/2020 00:00,,,,0
261,28/10/2020 00:00,,,,0
493,09/03/2021 00:00,,,,0
515,23/03/2021 00:00,,,,0


In [180]:
# Drop NaN lines
dayData = dayData.dropna(axis=0, how='any')

In [184]:
# Convert to int
dayData[['day','month','year']] = dayData[['day','month','year']].astype(int)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self[k1] = value[k2]


In [190]:
# Convert to pandas datetime object
dayData.date = pd.to_datetime(dayData.date)

In [194]:
# Droping old date columns
dayDataFinal = dayData.drop(['day','month','year'],axis=1)

In [196]:
dayDataFinal.head()

Unnamed: 0,date,value
0,2020-04-24,200
1,2020-04-25,340
2,2020-04-25,160
3,2020-04-26,0
4,2020-04-26,340


## Month Data Preparation

In [217]:
monthData.head()

Unnamed: 0,date,month,year,value
0,24/04/2020 00:00,4.0,20.0,3945
1,25/04/2020 00:00,4.0,20.0,4285
2,26/04/2020 00:00,4.0,20.0,4285
3,27/04/2020 00:00,4.0,20.0,4745
4,28/04/2020 00:00,4.0,20.0,4945


In [218]:
# Check 'NaN' values
monthData.loc[monthData.date.isna() | monthData.month.isna() | monthData.year.isna() | monthData.value.isna()]  

Unnamed: 0,date,month,year,value
28,07/05/2020 00:00,,,0
29,07/05/2020 00:00,,,0
42,23/05/2020 00:00,5.0,,0
151,15/07/2020 00:00,,,0
500,05/11/2020 00:00,,,0
501,05/11/2020 00:00,,,0
502,05/11/2020 00:00,,,0
540,17/11/2020 00:00,,,0
541,17/11/2020 00:00,,,0
999,14/04/2021 00:00,,,0


In [219]:
# Drop NaN lines
monthData = monthData.dropna(axis=0, how='any')

In [220]:
# Convert to int
monthData[['month','year']] = monthData[['month','year']].astype(int)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self[k1] = value[k2]


In [221]:
# Convert to pandas datetime object
monthData.date = pd.to_datetime(monthData.date)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self[name] = value


In [222]:
# Droping old date columns
monthDataFinal = monthData.drop(['month','year'],axis=1)

In [223]:
monthDataFinal.head()

Unnamed: 0,date,value
0,2020-04-24,3945
1,2020-04-25,4285
2,2020-04-26,4285
3,2020-04-27,4745
4,2020-04-28,4945


## Data Exportation

In [None]:
regData= pd.read_csv('registros.csv', encoding='latin-1', header=0, sep=';', names=regColumns)
dayData = pd.read_csv('soma_diaria.csv', encoding='latin-1', header=0, sep=';', names=dayColumns)
monthData = pd.read_csv('soma_mensal.csv', encoding='latin-1', header=0, sep=';', names=monthColumns)

In [224]:
regDataFinal.to_csv(r'D:\Learning\EstudosDataScience\Projetos\DataScienceProjects\20210816 Lavacar\3 - Dados de Upload\20210829\regDataset.csv', index=False, encoding='latin-1', sep=';')
dayDataFinal.to_csv(r'D:\Learning\EstudosDataScience\Projetos\DataScienceProjects\20210816 Lavacar\3 - Dados de Upload\20210829\dayDataset.csv', index=False, encoding='latin-1', sep=';')
monthDataFinal.to_csv(r'D:\Learning\EstudosDataScience\Projetos\DataScienceProjects\20210816 Lavacar\3 - Dados de Upload\20210829\monthDataset.csv', index=False, encoding='latin-1', sep=';')