In [8]:
import pandas as pd
import numpy as np
import plotly.express as px
import plotly.graph_objects as go
# Read NFLX.csv (path is relative to the kernel's working directory) into a
# DataFrame that the rest of the notebook refers to as `data`.
data = pd.read_csv(filepath_or_buffer="NFLX.csv")

# A bare trailing expression makes the notebook render the frame as the
# cell's output; pandas truncates the middle rows in that view.
data

Unnamed: 0,Date,Open,High,Low,Close,Adj Close,Volume
0,2010-01-04,7.931429,7.961429,7.565714,7.640000,7.640000,17239600
1,2010-01-05,7.652857,7.657143,7.258571,7.358571,7.358571,23753100
2,2010-01-06,7.361429,7.672857,7.197143,7.617143,7.617143,23290400
3,2010-01-07,7.731429,7.757143,7.462857,7.485714,7.485714,9955400
4,2010-01-08,7.498571,7.742857,7.465714,7.614286,7.614286,8180900
...,...,...,...,...,...,...,...
3114,2022-05-17,189.169998,191.399994,185.169998,190.559998,190.559998,9876700
3115,2022-05-18,186.720001,187.699997,176.270004,177.190002,177.190002,9665600
3116,2022-05-19,178.050003,186.300003,175.710007,183.479996,183.479996,10448500
3117,2022-05-20,185.869995,190.190002,179.770004,186.350006,186.350006,10422600


# Data Exploration 

## a) Overview

### 1. General Overview

In [9]:
# Schema overview of `data`: index range, per-column non-null counts,
# dtypes and memory usage.
print("DataFrame Info:")
# info() returns None and writes its report itself; buf=None (the default)
# sends that report to sys.stdout, right after the heading printed above.
data.info(buf=None)

DataFrame Info:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3119 entries, 0 to 3118
Data columns (total 7 columns):
 #   Column     Non-Null Count  Dtype  
---  ------     --------------  -----  
 0   Date       3119 non-null   object 
 1   Open       3119 non-null   float64
 2   High       3119 non-null   float64
 3   Low        3119 non-null   float64
 4   Close      3119 non-null   float64
 5   Adj Close  3119 non-null   float64
 6   Volume     3119 non-null   int64  
dtypes: float64(5), int64(1), object(1)
memory usage: 170.7+ KB


In [10]:
# Report the frame's dimensions. `data.shape` is a (rows, columns) tuple;
# both lines are emitted by a single print call, one per line.
print(
    f"\nNumber of Rows: {data.shape[0]}",
    f"Number of Columns: {data.shape[1]}",
    sep="\n",
)


Number of Rows: 3119
Number of Columns: 7


In [11]:
# Preview the top of the table. The heading goes to stdout; the frame itself
# is left as the cell's last expression so the notebook renders it richly.
print("\nSample Entries:")
data.head(n=5)


Sample Entries:


Unnamed: 0,Date,Open,High,Low,Close,Adj Close,Volume
0,2010-01-04,7.931429,7.961429,7.565714,7.64,7.64,17239600
1,2010-01-05,7.652857,7.657143,7.258571,7.358571,7.358571,23753100
2,2010-01-06,7.361429,7.672857,7.197143,7.617143,7.617143,23290400
3,2010-01-07,7.731429,7.757143,7.462857,7.485714,7.485714,9955400
4,2010-01-08,7.498571,7.742857,7.465714,7.614286,7.614286,8180900


### 2. Missing Values

In [12]:
# Count missing cells per column: isna() flags each NaN/None cell as True and
# sum() adds those flags up column by column. The resulting Series is kept in
# `missing_values` so later cells can reuse it.
missing_values = data.isna().sum()
print("\nMissing Values:", missing_values, sep="\n")


Missing Values:
Date         0
Open         0
High         0
Low          0
Close        0
Adj Close    0
Volume       0
dtype: int64


In [13]:
# Express the per-column missing counts (`missing_values`, computed in the
# previous cell) as a percentage of the total number of rows.
missing_percentage = missing_values.div(len(data)).mul(100)
print("\nPercentage of Missing Values:", missing_percentage, sep="\n")



Percentage of Missing Values:
Date         0.0
Open         0.0
High         0.0
Low          0.0
Close        0.0
Adj Close    0.0
Volume       0.0
dtype: float64


### 3. Duplicate Entries

In [14]:
# Summary statistics (count, mean, std, min, quartiles, max) for the numeric
# columns, used here to eyeball potential outliers. describe() only covers
# numeric columns by default, so the object-typed "Date" column is omitted.
summary = data.describe()

print("\nStatistical Summary:")
# Leave the frame as the cell's last expression instead of print()-ing it:
# the notebook then renders it as one table, consistent with the other cells
# that display frames, rather than a text dump that wraps wide columns onto
# a second block.
summary


Statistical Summary:
              Open         High          Low        Close    Adj Close  \
count  3119.000000  3119.000000  3119.000000  3119.000000  3119.000000   
mean    185.774606   188.574990   182.773507   185.747629   185.747629   
std     182.675775   185.222112   179.874082   182.575796   182.575796   
min       6.960000     7.178571     6.931429     7.018571     7.018571   
25%      33.255714    33.815000    32.611429    33.328571    33.328571   
50%     100.209999   102.110001    98.529999   100.230003   100.230003   
75%     336.800003   342.384995   331.125000   337.539993   337.539993   
max     692.349976   700.989990   686.090027   691.690002   691.690002   

             Volume  
count  3.119000e+03  
mean   1.879939e+07  
std    2.091301e+07  
min    1.144000e+06  
25%    6.233250e+06  
50%    1.232560e+07  
75%    2.342225e+07  
max    3.155418e+08  
