# PROJECT NAME: STOCK MARKET PRICES PREDICTION

## TEAM NAME: GROUP 3

## TA: PERCY

## GROUP MEMBERS: 
### 1. Ayobami Adeniyi
### 2. Ayodeji Anibaba
### 3. Chisom Eluwa
### 4. Khaleed Oyeleke
### 5. Ronke Akinmosin
### 6. Umar Idris

# Table of Contents
### [1. Introduction](#intro)
1.1. Project overview

1.2. Project statement
### [2. Import packages and modules](#import)
### [3. Data collection](#Webscraping)
### [4. Load data](#load)
### [5. Data description](#dd)
5.1. A peek into our training data

5.2. Training data basic statistics

5.3. Additional data available
### [6. Data cleaning](#cleaning)
6.1. Missing values imputation

6.2. Remove duplicates
### [7. Exploratory Data Analysis](#eda)

### [8. Modelling](#pp)


### [9. Conclusion](#conc)

### [1. Introduction](#intro)
Project overview

Project statement

### [2. Import packages and modules](#import)

In [None]:
#!pip install plotly==4.14.3

In [None]:
import matplotlib.pyplot as plt
import pandas as pd
import seaborn as sns
import numpy as np
import mplfinance as mpf
import requests
import json
from datetime import datetime
%matplotlib inline
import plotly.graph_objects as go


### [3. Data collection](#Webscraping)

In [None]:
# Fetch NIO's daily price history from the Yahoo Finance endpoint on RapidAPI.
import os

url_4 = "https://apidojo-yahoo-finance-v1.p.rapidapi.com/stock/v2/get-chart"

querystring_4 = {"interval": "1d", "symbol": "NIO", "range": "10y", "region": "US"}

# The API key is a secret, so it is read from the RAPIDAPI_KEY environment
# variable instead of being stored in the notebook. Any key that was
# previously committed here should be revoked and replaced.
headers_4 = {
    'x-rapidapi-key': os.environ["RAPIDAPI_KEY"],
    'x-rapidapi-host': "apidojo-yahoo-finance-v1.p.rapidapi.com",
}

# The timeout stops the cell from hanging on a stalled connection, and
# raise_for_status reports HTTP errors here rather than as a KeyError when the
# JSON payload is unpacked in later cells.
response_4 = requests.get(url_4, headers=headers_4, params=querystring_4, timeout=30)
response_4.raise_for_status()

In [None]:
def print_json(response_4):
    """Pretty-print a JSON-serialisable object.

    The object is rendered with keys sorted alphabetically and a four-space
    indent, then written to standard output.
    """
    print(json.dumps(response_4, indent=4, sort_keys=True))

    
# Decode the response body as JSON and pretty-print it for inspection.
print_json(response_4.json())  

In [None]:
# Keep the decoded JSON payload under a shorter name for the cells below.
data_4=response_4.json()

In [None]:
# Pull the timestamp list and each price/volume series out of the first
# result of the chart payload.
timestam_4 = data_4["chart"]["result"][0]["timestamp"]
Clos_4, Ope_4, lo_4, hig_4, Volum_4 = (
    data_4["chart"]["result"][0]["indicators"]["quote"][0][field]
    for field in ("close", "open", "low", "high", "volume")
)
AdjClos_4 = data_4["chart"]["result"][0]["indicators"]["adjclose"][0]['adjclose']

In [None]:
# Map each output column name to its extracted list, in column order.
dict_of_lists_4 = dict(
    timestamp=timestam_4,
    Close=Clos_4,
    Open=Ope_4,
    Low=lo_4,
    High=hig_4,
    Volume=Volum_4,
    AdjClose=AdjClos_4,
)

In [None]:
# Build the price table: one column per key of the dictionary.
Nio_df = pd.DataFrame(data=dict_of_lists_4)

In [None]:
# Convert the epoch-second timestamps to 'YYYY-MM-DD' date strings.
# The conversion is pinned to UTC so the resulting dates do not depend on the
# timezone of the machine running the notebook (datetime.fromtimestamp uses
# local time). It is also vectorised rather than applied row by row.
Nio_df['timestamp'] = (
    pd.to_datetime(Nio_df['timestamp'], unit='s', utc=True)
    .dt.strftime('%Y-%m-%d')
)

In [None]:
# Display the assembled price table.
Nio_df

In [None]:
# Save the table as a CSV file. The row index is written as the first column,
# which is why the file is read back with index_col=0.
Nio_df.to_csv("NIO_STOCK_DATA.csv")

In [None]:
# Read the saved CSV back, using its first column as the row index so that no
# 'Unnamed: 0' column appears.
NIO_2yrs = pd.read_csv("NIO_STOCK_DATA.csv", index_col=0)
# Add a "Date" column holding a copy of "timestamp" (the original column is kept).
NIO_2yrs = NIO_2yrs.assign(Date=NIO_2yrs["timestamp"])

# 4. Load Data

In [None]:
# Load the saved stock data; the CSV's first column is the row index.
data = pd.read_csv("NIO_STOCK_DATA.csv", index_col=0)
data

In [None]:
data.head()  # show the first five rows of the dataset

In [None]:
data.tail()  # show the last five rows of the dataset

# 5. Data description
This dataset consists of 2 years of NIO stock data: the Timestamp, Close price, Open price, Low price, High price, Volume and AdjClose price for each trading day. The stock market opens on weekdays only, and is closed on public holidays such as Martin Luther King Jr. Day. This data is freely available for public use and research.

**Timestamp - This is the respective dates for each market day**

**Open prices - This is the opening price for each market day**

**Close prices - This is the closing price for each market day**

**Low prices - This is the lowest price reached on each market day**

**High prices - This is the highest price reached on each market day**

**Volume - This is the quantity of stocks sold and bought that day**

**Adjusted Closing Price - The adjusted closing price amends a stock's closing price to reflect that stock's value after accounting for any corporate actions. Corporate actions occur during the period between the market closing and opening the next day**


### Source of data:
The data for this stock is maintained on yahoo finance API.

# 6. Data Cleaning

In [None]:
# Print the column names, non-null counts and dtypes of the dataset.
data.info()

In [None]:
# Summary statistics for every column; include='all' also covers the
# non-numeric columns.
data.describe(include='all')

In [None]:
# Count the missing values in each column.
data.isna().sum()

### there are no missing values

# 7. Exploratory Data Analysis

In [None]:
# Parse the date strings in "timestamp" into datetime values.
data['timestamp'] = pd.to_datetime(arg=data['timestamp'])
data

In [None]:
# Add the weekday name (e.g. 'Monday') of each trading day. This must run
# before "timestamp" becomes the index, because it reads the column.
weekday_names = data['timestamp'].dt.day_name
data['Weekday'] = weekday_names()
del weekday_names
data

In [None]:
# Index the frame by the "timestamp" column so the date-string slicing used
# below (e.g. data["2019":"2019"]) works.
# The guard makes the cell safe to re-run: once "timestamp" has become the
# index it is no longer a column, and an unguarded set_index raises KeyError.
if "timestamp" in data.columns:
    data = data.set_index("timestamp")

### Interactive Analysis: a view of the stock pattern over the last three years

In [None]:
# Interactive candlestick chart of the full price history.
figure = go.Figure(
    data=[
        go.Candlestick(
            x=data.index,
            low=data['Low'],
            high=data['High'],
            open=data['Open'],
            close=data['Close'],
            increasing_line_color='green',
            decreasing_line_color='red',
        )
    ]
)
# This layout call used to be commented out because it passed yaxis_title
# twice, which is a SyntaxError. The second keyword was meant to label the
# x axis.
figure.update_layout(
    title="NIO Historic data 2018-2021",
    yaxis_title="NIO Stock Price USD ($)",
    xaxis_title="Date",
)

figure.show()

In [None]:
# Line chart of the full price history with a volume panel underneath.
mpf.plot(
    data,
    type='line',
    volume=True,
    figratio=(20, 12),
)

### YEAR 2019 stock analysis

In [None]:
# Select every row dated in 2019. Slicing a datetime index with year strings
# covers the whole year, from its first trading day to its last.
NIO_2019 = data.loc["2019":"2019"]
NIO_2019.head()

In [None]:
# Show the last five rows of the 2019 slice.
NIO_2019.tail()

In [None]:
# Line plot of the daily close and open prices across 2019.
# The 2019 slice is plotted (not the full history), matching the section and
# the name of the saved file.
plt.style.use("ggplot")
fig, ax = plt.subplots()
fig.set_size_inches([30, 15])
ax.plot(NIO_2019.index, NIO_2019["Close"], color='blue', marker="v", linestyle="--")
ax.set_xlabel("Date")
ax.set_ylabel("Close Price", color='blue')
# Rotate the automatically placed date ticks. Passing the whole index to
# set_xticklabels would attach the first few dates to unrelated tick positions.
ax.tick_params(axis="x", rotation=90)

# The open price is drawn on a second y axis that shares the same x axis.
ax2 = ax.twinx()
ax2.plot(NIO_2019.index, NIO_2019["Open"], color='red', marker="o")
ax2.set_ylabel("Open Price", color='red')
# Save the picture locally so it can be viewed at full resolution.
fig.savefig("NIO_2019.png", dpi=300)

In [None]:
# Candlestick chart for January-December 2019 with a 20-period moving average
# and a volume panel.
mpf.plot(
    data.loc['2019-01':'2019-12'],
    type='candle',
    mav=20,
    title='Nio Price 2019',
    volume=True,
    style='yahoo',
    figratio=(20, 12),
    tight_layout=True,
)

In [None]:
# Distribution of daily traded volume per weekday in 2019.
order = ['Monday', 'Tuesday', 'Wednesday', 'Thursday', 'Friday']
sns.set(rc={'figure.figsize': (15, 7)})
sns.boxplot(x='Weekday', y='Volume', data=NIO_2019, order=order);


### visualisation on quarterly basis
#### 1st quarter

In [None]:
# Rows for the first quarter of 2019 (January through March).
df_1 = data.loc["2019-1":"2019-3"]
df_1

In [None]:
# Candlestick chart for the first quarter of 2019 with a 50-period moving
# average and a volume panel.
mpf.plot(
    df_1,
    type='candle',
    mav=50,
    title='NIO Price 2019_first quarter',
    volume=True,
    style='yahoo',
    figratio=(20, 12),
    tight_layout=True,
)


In [None]:
# Correlation between the numeric columns for the first quarter of 2019.
# The text column "Weekday" is excluded explicitly: DataFrame.corr() raises on
# non-numeric columns in pandas >= 2.0 (earlier versions dropped them silently).
first_quarter_corr = df_1.select_dtypes(include="number").corr()
sns.heatmap(first_quarter_corr, annot=True, cmap="seismic",
            xticklabels=first_quarter_corr.columns.values,
            yticklabels=first_quarter_corr.columns.values)
plt.title('Prices correlation for the first quarter of 2019')
plt.show()
print(first_quarter_corr)

In [None]:
# Distribution of daily traded volume per weekday in the first quarter.
order = ['Monday', 'Tuesday', 'Wednesday', 'Thursday', 'Friday']
sns.set(rc={'figure.figsize': (15, 7)})
sns.boxplot(x='Weekday', y='Volume', data=df_1, order=order);


#### checking for outliers in each column


In [None]:
# Box plot of first-quarter close prices to check for outliers.
sns.set(rc={'figure.figsize': (10, 7)})
sns.boxplot(data=df_1, y='Close')
plt.show()

In [None]:
# Box plot of first-quarter open prices to check for outliers.
sns.set(rc={'figure.figsize': (10, 7)})
sns.boxplot(data=df_1, y='Open')
plt.show()

In [None]:
# Box plot of first-quarter low prices to check for outliers.
sns.set(rc={'figure.figsize': (15, 7)})
sns.boxplot(data=df_1, y='Low')
plt.show()

In [None]:
# Box plot of first-quarter high prices to check for outliers.
sns.set(rc={'figure.figsize': (15, 7)})
sns.boxplot(data=df_1, y='High')
plt.show()

In [None]:
# Box plot of first-quarter traded volume to check for outliers.
sns.set(rc={'figure.figsize': (15, 7)})
sns.boxplot(data=df_1, y='Volume')
plt.show()

In [None]:
# First-quarter open vs. close prices with a least-squares trend line.
x = df_1.Open
y = df_1.Close
# A degree-1 fit returns the slope (m) and intercept (b) of the line.
m, b = np.polyfit(x, y, 1)
df_1.plot.scatter(x='Open', y='Close', c='red')
# Draw the fitted line over the scatter points.
plt.plot(x, m * x + b);

In [None]:
# First-quarter open vs. low prices with a least-squares trend line.
x = df_1.Open
y = df_1.Low
# A degree-1 fit returns the slope (m) and intercept (b) of the line.
m, b = np.polyfit(x, y, 1)
df_1.plot.scatter(x='Open', y='Low', c='red')
# Draw the fitted line over the scatter points.
plt.plot(x, m * x + b);

In [None]:
# First-quarter close vs. high prices with a least-squares trend line.
x = df_1.Close
y = df_1.High
# A degree-1 fit returns the slope (m) and intercept (b) of the line.
m, b = np.polyfit(x, y, 1)
df_1.plot.scatter(x='Close', y='High', c='red')
# Draw the fitted line over the scatter points.
plt.plot(x, m * x + b);

In [None]:
# First-quarter high vs. low prices with a least-squares trend line.
x = df_1.High
y = df_1.Low
# A degree-1 fit returns the slope (m) and intercept (b) of the line.
m, b = np.polyfit(x, y, 1)
df_1.plot.scatter(x='High', y='Low', c='red')
# Draw the fitted line over the scatter points.
plt.plot(x, m * x + b);

In [None]:
# First-quarter close vs. low prices with a least-squares trend line.
x = df_1.Close
y = df_1.Low
# A degree-1 fit returns the slope (m) and intercept (b) of the line.
m, b = np.polyfit(x, y, 1)
df_1.plot.scatter(x='Close', y='Low', c='red')
# Draw the fitted line over the scatter points.
plt.plot(x, m * x + b);

### 2nd Quarter

In [None]:
# Rows for the second quarter of 2019 (April through June).
df_2 = data.loc["2019-4":"2019-6"]
df_2

In [None]:
# Candlestick chart for the second quarter of 2019 with a 50-period moving
# average and a volume panel.
mpf.plot(
    df_2,
    type='candle',
    mav=50,
    title='NIO Price 2019_second quarter',
    volume=True,
    style='yahoo',
    figratio=(20, 12),
    tight_layout=True,
)


In [None]:
# Correlation between the numeric columns for the second quarter of 2019.
# The text column "Weekday" is excluded explicitly: DataFrame.corr() raises on
# non-numeric columns in pandas >= 2.0 (earlier versions dropped them silently).
second_quarter_corr = df_2.select_dtypes(include="number").corr()
sns.heatmap(second_quarter_corr, annot=True, cmap="seismic",
            xticklabels=second_quarter_corr.columns.values,
            yticklabels=second_quarter_corr.columns.values)
plt.title('Prices correlation for the second quarter of 2019')
plt.show()
print(second_quarter_corr)

In [None]:
# Distribution of daily traded volume per weekday in the second quarter.
order = ['Monday', 'Tuesday', 'Wednesday', 'Thursday', 'Friday']
sns.set(rc={'figure.figsize': (15, 7)})
sns.boxplot(x='Weekday', y='Volume', data=df_2, order=order);


#### checking for outliers in each column

In [None]:
# Box plot of second-quarter close prices to check for outliers.
sns.set(rc={'figure.figsize': (15, 7)})
sns.boxplot(data=df_2, y='Close')
plt.show()

In [None]:
# Box plot of second-quarter open prices to check for outliers.
sns.set(rc={'figure.figsize': (15, 7)})
sns.boxplot(data=df_2, y='Open')
plt.show()

In [None]:
# Box plot of second-quarter low prices to check for outliers.
sns.set(rc={'figure.figsize': (15, 7)})
sns.boxplot(data=df_2, y='Low')
plt.show()

In [None]:
# Box plot of second-quarter high prices to check for outliers.
sns.set(rc={'figure.figsize': (15, 7)})
sns.boxplot(data=df_2, y='High')
plt.show()

In [None]:
# Box plot of second-quarter traded volume to check for outliers.
sns.set(rc={'figure.figsize': (15, 7)})
sns.boxplot(data=df_2, y='Volume')
plt.show()

#### scatter plots showing relationship between each column

In [None]:
# Second-quarter open vs. close prices with a least-squares trend line.
df_2.plot.scatter(x='Open', y='Close', c='green')
# Fit the line on the same two columns that are plotted. The fit previously
# used Low as y, so the line did not describe the open/close points.
x = df_2.Open
y = df_2.Close
# A degree-1 fit returns the slope (m) and intercept (b) of the line.
m, b = np.polyfit(x, y, 1)
# Draw the fitted line over the scatter points.
plt.plot(x, m * x + b);

In [None]:
# Second-quarter open vs. low prices with a least-squares trend line.
x = df_2.Open
y = df_2.Low
# A degree-1 fit returns the slope (m) and intercept (b) of the line.
m, b = np.polyfit(x, y, 1)
df_2.plot.scatter(x='Open', y='Low', c='green')
# Draw the fitted line over the scatter points.
plt.plot(x, m * x + b);

In [None]:
# Second-quarter close vs. high prices with a least-squares trend line.
x = df_2.Close
y = df_2.High
# A degree-1 fit returns the slope (m) and intercept (b) of the line.
m, b = np.polyfit(x, y, 1)
df_2.plot.scatter(x='Close', y='High', c='green')
# Draw the fitted line over the scatter points.
plt.plot(x, m * x + b);

In [None]:
# Second-quarter high vs. low prices with a least-squares trend line.
df_2.plot.scatter(x='High', y='Low', c='green')
# Fit the line on the same two columns that are plotted. The fit previously
# used Open as x, so the line did not describe the high/low points.
x = df_2.High
y = df_2.Low
# A degree-1 fit returns the slope (m) and intercept (b) of the line.
m, b = np.polyfit(x, y, 1)
# Draw the fitted line over the scatter points.
plt.plot(x, m * x + b);

In [None]:
# Second-quarter close vs. low prices with a least-squares trend line.
x = df_2.Close
y = df_2.Low
# A degree-1 fit returns the slope (m) and intercept (b) of the line.
m, b = np.polyfit(x, y, 1)
df_2.plot.scatter(x='Close', y='Low', c='green')
# Draw the fitted line over the scatter points.
plt.plot(x, m * x + b);

### 3rd Quarter

In [None]:
# Rows for the third quarter of 2019 (July through September).
df_3 = data.loc["2019-7":"2019-9"]
df_3

In [None]:
# Candlestick chart for the third quarter of 2019 with a 50-period moving
# average and a volume panel.
mpf.plot(
    df_3,
    type='candle',
    mav=50,
    title='NIO Price 2019_third quarter',
    volume=True,
    style='yahoo',
    figratio=(20, 12),
    tight_layout=True,
)


In [None]:
# Correlation between the numeric columns for the third quarter of 2019.
# The text column "Weekday" is excluded explicitly: DataFrame.corr() raises on
# non-numeric columns in pandas >= 2.0 (earlier versions dropped them silently).
third_quarter_corr = df_3.select_dtypes(include="number").corr()
sns.heatmap(third_quarter_corr, annot=True, cmap="seismic",
            xticklabels=third_quarter_corr.columns.values,
            yticklabels=third_quarter_corr.columns.values)
plt.title('Prices correlation for the third quarter of 2019')
plt.show()
print(third_quarter_corr)

In [None]:
# Distribution of daily traded volume per weekday in the third quarter.
order = ['Monday', 'Tuesday', 'Wednesday', 'Thursday', 'Friday']
sns.set(rc={'figure.figsize': (15, 7)})
sns.boxplot(x='Weekday', y='Volume', data=df_3, order=order);


#### checking for outliers in each column

In [None]:
# Box plot of third-quarter close prices to check for outliers.
sns.set(rc={'figure.figsize': (15, 7)})
sns.boxplot(data=df_3, y='Close')
plt.show()

In [None]:
# Box plot of third-quarter open prices to check for outliers.
sns.set(rc={'figure.figsize': (15, 7)})
sns.boxplot(data=df_3, y='Open')
plt.show()

In [None]:
# Box plot of third-quarter low prices to check for outliers.
sns.set(rc={'figure.figsize': (15, 7)})
sns.boxplot(data=df_3, y='Low')
plt.show()

In [None]:
# Box plot of third-quarter high prices to check for outliers.
sns.set(rc={'figure.figsize': (15, 7)})
sns.boxplot(data=df_3, y='High')
plt.show()

In [None]:
# Box plot of third-quarter traded volume to check for outliers.
sns.set(rc={'figure.figsize': (15, 7)})
sns.boxplot(data=df_3, y='Volume')
plt.show()

#### scatter plots showing relationship between columns for the third quarter

In [None]:
# Third-quarter open vs. close prices with a least-squares trend line.
x = df_3.Open
y = df_3.Close
# A degree-1 fit returns the slope (m) and intercept (b) of the line.
m, b = np.polyfit(x, y, 1)
df_3.plot.scatter(x='Open', y='Close', c='black')
# Draw the fitted line over the scatter points.
plt.plot(x, m * x + b);

In [None]:
# Third-quarter open vs. low prices with a least-squares trend line.
x = df_3.Open
y = df_3.Low
# A degree-1 fit returns the slope (m) and intercept (b) of the line.
m, b = np.polyfit(x, y, 1)
df_3.plot.scatter(x='Open', y='Low', c='black')
# Draw the fitted line over the scatter points.
plt.plot(x, m * x + b);

In [None]:
# Third-quarter close vs. high prices with a least-squares trend line.
x = df_3.Close
y = df_3.High
# A degree-1 fit returns the slope (m) and intercept (b) of the line.
m, b = np.polyfit(x, y, 1)
df_3.plot.scatter(x='Close', y='High', c='black')
# Draw the fitted line over the scatter points.
plt.plot(x, m * x + b);

In [None]:
# Third-quarter high vs. low prices with a least-squares trend line.
x = df_3.High
y = df_3.Low
# A degree-1 fit returns the slope (m) and intercept (b) of the line.
m, b = np.polyfit(x, y, 1)
df_3.plot.scatter(x='High', y='Low', c='black')
# Draw the fitted line over the scatter points.
plt.plot(x, m * x + b);

In [None]:
# Third-quarter close vs. low prices with a least-squares trend line.
x = df_3.Close
y = df_3.Low
# A degree-1 fit returns the slope (m) and intercept (b) of the line.
m, b = np.polyfit(x, y, 1)
df_3.plot.scatter(x='Close', y='Low', c='black')
# Draw the fitted line over the scatter points.
plt.plot(x, m * x + b);

### 4th quarter

In [None]:
# Rows for the fourth quarter of 2019 (October through December).
df_4 = data.loc["2019-10":"2019-12"]
df_4

In [None]:
# Candlestick chart for the fourth quarter of 2019 with a 50-period moving
# average and a volume panel.
mpf.plot(
    df_4,
    type='candle',
    mav=50,
    title='NIO Price 2019_fourth quarter',
    volume=True,
    style='yahoo',
    figratio=(20, 12),
    tight_layout=True,
)


In [None]:
# Correlation between the numeric columns for the fourth quarter of 2019.
# The text column "Weekday" is excluded explicitly: DataFrame.corr() raises on
# non-numeric columns in pandas >= 2.0 (earlier versions dropped them silently).
fourth_quarter_corr = df_4.select_dtypes(include="number").corr()
sns.heatmap(fourth_quarter_corr, annot=True, cmap="seismic",
            xticklabels=fourth_quarter_corr.columns.values,
            yticklabels=fourth_quarter_corr.columns.values)
plt.title('Prices correlation for the fourth quarter of 2019')
plt.show()
print(fourth_quarter_corr)

In [None]:
# Distribution of daily traded volume per weekday in the fourth quarter.
order = ['Monday', 'Tuesday', 'Wednesday', 'Thursday', 'Friday']
sns.set(rc={'figure.figsize': (15, 7)})
sns.boxplot(x='Weekday', y='Volume', data=df_4, order=order);


#### checking for outliers in each column

In [None]:
# Box plot of fourth-quarter close prices to check for outliers.
sns.set(rc={'figure.figsize': (15, 7)})
sns.boxplot(data=df_4, y='Close')
plt.show()

In [None]:
# Box plot of fourth-quarter open prices to check for outliers.
sns.set(rc={'figure.figsize': (15, 7)})
sns.boxplot(data=df_4, y='Open')
plt.show()

In [None]:
# Box plot of fourth-quarter low prices to check for outliers.
sns.set(rc={'figure.figsize': (15, 7)})
sns.boxplot(data=df_4, y='Low')
plt.show()

In [None]:
# Box plot of fourth-quarter high prices to check for outliers.
sns.set(rc={'figure.figsize': (15, 7)})
sns.boxplot(data=df_4, y='High')
plt.show()

In [None]:
# Box plot of fourth-quarter traded volume to check for outliers.
sns.set(rc={'figure.figsize': (15, 7)})
sns.boxplot(data=df_4, y='Volume')
plt.show()

#### scatter plots showing relationship between columns

In [None]:
# Fourth-quarter open vs. close prices with a least-squares trend line.
x = df_4.Open
y = df_4.Close
# A degree-1 fit returns the slope (m) and intercept (b) of the line.
m, b = np.polyfit(x, y, 1)
df_4.plot.scatter(x='Open', y='Close', c='blue')
# Draw the fitted line over the scatter points.
plt.plot(x, m * x + b);

In [None]:
# Fourth-quarter open vs. low prices with a least-squares trend line.
x = df_4.Open
y = df_4.Low
# A degree-1 fit returns the slope (m) and intercept (b) of the line.
m, b = np.polyfit(x, y, 1)
df_4.plot.scatter(x='Open', y='Low', c='blue')
# Draw the fitted line over the scatter points.
plt.plot(x, m * x + b);

In [None]:
# Fourth-quarter close vs. high prices with a least-squares trend line.
x = df_4.Close
y = df_4.High
# A degree-1 fit returns the slope (m) and intercept (b) of the line.
m, b = np.polyfit(x, y, 1)
df_4.plot.scatter(x='Close', y='High', c='blue')
# Draw the fitted line over the scatter points.
plt.plot(x, m * x + b);

In [None]:
# Fourth-quarter high vs. low prices with a least-squares trend line.
x = df_4.High
y = df_4.Low
# A degree-1 fit returns the slope (m) and intercept (b) of the line.
m, b = np.polyfit(x, y, 1)
df_4.plot.scatter(x='High', y='Low', c='blue')
# Draw the fitted line over the scatter points.
plt.plot(x, m * x + b);

In [None]:
# Fourth-quarter close vs. low prices with a least-squares trend line.
x = df_4.Close
y = df_4.Low
# A degree-1 fit returns the slope (m) and intercept (b) of the line.
m, b = np.polyfit(x, y, 1)
df_4.plot.scatter(x='Close', y='Low', c='blue')
# Draw the fitted line over the scatter points.
plt.plot(x, m * x + b);