# Predictive Analytics | Assignment 1

### source code repo
- https://github.com/kibambe-0167/time-series-analysis

In [1]:
import matplotlib.pyplot as plt
from scipy import stats
import seaborn as sns
import pandas as pd
import numpy as np
import os, sys
import random

# configs
sns.set_style(style='darkgrid')
np.random.seed(18)
random.seed(18)

# data loading
- load data from different files
- combine the data into a single variable

#### summary
- the total number of samples if 420

In [2]:
BASE_DIR: str = "./data"

In [3]:
filesnames = os.listdir(BASE_DIR)
filesnames = [ f for f in filesnames if f.lower().endswith(".csv") ]
data = list()
for fn in filesnames:
  d = pd.read_csv(f"{BASE_DIR}/{fn}")
  data.append(d)
  
df = pd.concat(data, ignore_index=True)
print(df.shape)
df.head()

(420, 3)


Unnamed: 0,Date,Actual,Goal
0,Jun 28 2022,7225,8000
1,Jun 29 2022,8819,8000
2,Jun 30 2022,5082,8000
3,Jul 1 2022,6819,8000
4,Jul 2 2022,5603,8000


# data cleaning
#### summary
- there is no misssing or na or null data

### check for null value
- there are no null values
- second block check if any of the columns have missing values
- showing rows with at least one null value

In [4]:
df.isnull().sum().to_frame().T 

Unnamed: 0,Date,Actual,Goal
0,0,0,0


In [5]:
df.isnull().any().sum()

np.int64(0)

In [6]:
df[df.isnull().any(axis=1)]  

Unnamed: 0,Date,Actual,Goal


### check for na values
- there are no na values
- second code check if any of the columns have missing values
- third block shows rows with atleast one missing value

In [7]:
df.isna().sum().to_frame().T

Unnamed: 0,Date,Actual,Goal
0,0,0,0


In [8]:
#  if any of the columns have missing values
df.isna().any().sum()

np.int64(0)

In [9]:
df[df.isna().any(axis=1)]  

Unnamed: 0,Date,Actual,Goal


### duplicate samples
- there is no duplicate samples in the datasets 

In [10]:
df.duplicated().sum()

np.int64(0)

### check for the columns data type

In [11]:
df.dtypes

Date      object
Actual     int64
Goal       int64
dtype: object

### info of the data

In [12]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 420 entries, 0 to 419
Data columns (total 3 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   Date    420 non-null    object
 1   Actual  420 non-null    int64 
 2   Goal    420 non-null    int64 
dtypes: int64(2), object(1)
memory usage: 10.0+ KB


### show percentage of missing values
-  Returns percentage of missing values for each column

In [13]:
(df.isnull().sum() / len(df)) * 100

Date      0.0
Actual    0.0
Goal      0.0
dtype: float64

# descriptive statistics


#### summary
- the 'Date' column is split into 3 column containing Month, Day and Year to better descriptive analysis

In [14]:
def seperate_date(x, to_extract: int):
  """ to_extract
    0 : for month
    1 : for day
    2 : for year
  """
  x = str(x).lower().split(" ")
  if to_extract >= 0 and to_extract <= 2:
    x = x[to_extract].strip()
    # return numeric values for day and year column
    return int(x) if to_extract > 0 else x
  return "unknown"

#### transform data for descriptive statistics

In [15]:
df_ds = df.copy()

# seperate the month, day, year into it on column for better descriptive analysis
df_ds['Month'] = df_ds['Date'].apply( lambda x: seperate_date(x, 0))
df_ds['Day'] = df_ds['Date'].apply( lambda x: seperate_date(x, 1))
df_ds['Year'] = df_ds['Date'].apply( lambda x: seperate_date(x, 2))

df_ds.head()

Unnamed: 0,Date,Actual,Goal,Month,Day,Year
0,Jun 28 2022,7225,8000,jun,28,2022
1,Jun 29 2022,8819,8000,jun,29,2022
2,Jun 30 2022,5082,8000,jun,30,2022
3,Jul 1 2022,6819,8000,jul,1,2022
4,Jul 2 2022,5603,8000,jul,2,2022


In [16]:
# the data types of column of dataset
df_ds.dtypes

Date      object
Actual     int64
Goal       int64
Month     object
Day        int64
Year       int64
dtype: object

In [17]:
# number of items in the dataset
df_ds.shape[0]

420

### description of data

In [18]:
numeric_samples = df_ds[['Actual', 'Goal', 'Day', 'Year']]

numeric_samples.describe()

Unnamed: 0,Actual,Goal,Day,Year
count,420.0,420.0,420.0,420.0
mean,5674.140476,7682.142857,15.557143,2022.488095
std,2930.565439,613.717635,8.754355,0.500454
min,0.0,6500.0,1.0,2022.0
25%,3634.0,8000.0,8.0,2022.0
50%,5402.0,8000.0,15.5,2022.0
75%,7349.0,8000.0,23.0,2023.0
max,16611.0,8000.0,31.0,2023.0


### sum of numeric columns

In [19]:
numeric_samples.sum().to_frame().T

Unnamed: 0,Actual,Goal,Day,Year
0,2383139,3226500,6534,849445


### median of numeric columns
- a number in the middle of each column

In [20]:
numeric_samples.median().to_frame().T

Unnamed: 0,Actual,Goal,Day,Year
0,5402.0,8000.0,15.5,2022.0


### mode
- mode of the numeric columns
- now able to show mode for Day, because about 24 values have the same frequency

In [21]:
numeric_samples[['Actual', 'Goal', 'Year']].mode()

Unnamed: 0,Actual,Goal,Year
0,0,8000,2022


### variance
- of numeric columns

In [22]:
numeric_samples.var().to_frame().T

Unnamed: 0,Actual,Goal,Day,Year
0,8588214.0,376649.335152,76.638732,0.250455


### z-score
- of numeric columns

In [23]:
(numeric_samples - numeric_samples.mean() ) / numeric_samples.std()

Unnamed: 0,Actual,Goal,Day,Year
0,0.529201,0.517921,1.421333,-0.975304
1,1.073124,0.517921,1.535562,-0.975304
2,-0.202057,0.517921,1.649791,-0.975304
3,0.390662,0.517921,-1.662846,-0.975304
4,-0.024275,0.517921,-1.548617,-0.975304
...,...,...,...,...
415,-0.181242,-1.926200,1.078647,1.022880
416,0.117677,-1.926200,1.192876,1.022880
417,-0.822756,-1.926200,1.307105,1.022880
418,0.283174,-1.926200,1.421333,1.022880


### co-efficient of covariance

to show how the values are around the mean
- high value tell that, more values are around the mean
- lower value tell, that values are far from the mean


In [24]:
cov = numeric_samples.std() / numeric_samples.mean()

print("MEANS: \n", numeric_samples.mean().to_frame().T)
print("\nCoefficient variance: \n\n", cov.to_frame().T)


MEANS: 
         Actual         Goal        Day         Year
0  5674.140476  7682.142857  15.557143  2022.488095

Coefficient variance: 

      Actual      Goal       Day      Year
0  0.516477  0.079889  0.562723  0.000247


### standard error
- show how far the sample data is from the populations data

In [25]:
std_err = numeric_samples.apply(lambda x: np.std(x, ddof=1) / np.sqrt(len(x)))
std_err.to_frame().T

Unnamed: 0,Actual,Goal,Day,Year
0,142.996951,29.946354,0.427169,0.02442


### Confidence level

In [28]:
conf = .95
m = numeric_samples.mean()
n = numeric_samples.shape[0]
stderr = stats.sem(numeric_samples)
interval = stderr * stats.t.ppf( ( 1 + conf ) / 2.0, n - 1)
# 
(n - interval).round(3)

array([138.919, 361.136, 419.16 , 419.952])

# data visualisation

### show word cloud for months

# models

# validation

# compare model with world models

# refelctive analysis
- whats i have learn from doing this assignment