In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import matplotlib.pyplot as plt
import seaborn as sns
import plotly.express as px 
%matplotlib inline

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Gather the full path of every file found under the Kaggle input directory,
# then print them one per line.
input_paths = [
    os.path.join(folder, name)
    for folder, _, names in os.walk('/kaggle/input')
    for name in names
]
for path in input_paths:
    print(path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

## There are 7 steps in the Data Science lifecycle
1. Business Understanding
2. Data mining
3. Data cleaning
4. Data Exploration
5. Feature Engineering
6. Predictive Modeling
7. Data visualization


# 1.Business Understanding

This dataset contains petrol consumption, prices, and world-share percentages by country. The goal is to visualize the data and predict the price of petrol.

# 2.Data mining

In [2]:
# Location of the petrol price CSV inside the Kaggle input directory.
PETROL_CSV_PATH = '/kaggle/input/petrolgas-prices-worldwide/Petrol Dataset June 20 2022.csv'
# The file is decoded as latin-1, which is requested explicitly here.
df = pd.read_csv(PETROL_CSV_PATH, encoding='latin-1')

In [3]:
# Preview the first rows of the freshly loaded frame.
df.head()

Check the number of rows and columns and the data type of each column.

In [4]:
# Print a summary of the frame: columns, non-null counts and dtypes.
df.info()

In [5]:
# Dimensions of the frame as a (rows, columns) tuple.
df.shape

In [6]:
# Column labels as loaded from the CSV.
df.columns

# 3.Data cleaning

In [7]:
# Look at the first rows again before cleaning.
df.head()

We can drop the first column because it only contains serial numbers. It will not have a significant effect on modeling or visualization.

In [8]:
# Column labels before the '#' column is dropped.
df.columns

In [9]:
# Remove the '#' serial-number column. Rebinding `df` with errors='ignore'
# (instead of an in-place drop) keeps this cell safe to re-run: once the column
# is gone, a second execution is a no-op rather than a KeyError.
df = df.drop(columns='#', errors='ignore')

In [10]:
# Confirm the '#' column is no longer present.
df.head()

Rename column **World Share** to **World Share %**

In [11]:
# Relabel the share column so its name states that it is a percentage.
share_column_names = {'World Share': 'World Share %'}
df = df.rename(columns=share_column_names)

In [12]:
# Check the renamed 'World Share %' column.
df.head()

Convert the data type of the **World Share %** column to **float64**.

The **World Share %** values are not exactly accurate, therefore we recalculate the world share percentage from the daily consumption figures.


In [13]:
# The provided world-share figures are not exactly accurate, so the percentage
# is rebuilt from each country's daily consumption relative to the global total.
daily_consumption = df['Daily Oil Consumption (Barrels)']
Total_Consumption = daily_consumption.sum()
share_fraction = daily_consumption / Total_Consumption
df['World Share %'] = share_fraction * 100

In [14]:
# Inspect the recalculated 'World Share %' values.
df.head()

Check for any null cells

In [15]:
# Count the missing entries in every column.
missing_mask = df.isna()
missing_mask.sum()

# 4.Data Exploration

Statistical details of the dataset

In [16]:
# Summary statistics for every column; include='all' also covers the
# non-numeric columns.
df.describe(include='all')

In [17]:
# Column labels available for the exploration below.
df.columns

In [18]:
# For each measure, print the minimum value followed by the country (or
# countries) whose row holds that minimum. Labels are kept verbatim.
lowest_labels = {
    'Daily Oil Consumption (Barrels)': 'Country with lowest Daily Oil Consumption (Barrels):',
    'World Share %': '\n\nCountry with lowest World Share % :',
    'Yearly Gallons Per Capita': '\n\nCountry with lowest Yearly Gallons Per Capita :',
    'Price Per Gallon (USD)': '\n\nCountry with lowest Price Per Gallon (USD) :',
    'Price Per Liter (USD)': '\n\nCountry with lowest Price Per Liter (USD) :',
}
for column, label in lowest_labels.items():
    lowest_value = df[column].min()
    print(label, lowest_value)
    print(df.loc[df[column] == lowest_value, 'Country'])

In [19]:
# For each measure, print the maximum value followed by the country (or
# countries) whose row holds that maximum. Labels are kept verbatim.
highest_report = [
    ('Country with highest Daily Oil Consumption (Barrels):', 'Daily Oil Consumption (Barrels)'),
    ('\n\nCountry with highest World Share % :', 'World Share %'),
    ('\n\nCountry with highest Yearly Gallons Per Capita :', 'Yearly Gallons Per Capita'),
    ('\n\nCountry with highest Price Per Gallon (USD) :', 'Price Per Gallon (USD)'),
    ('\n\nCountry with highest Price Per Liter (USD) :', 'Price Per Liter (USD)'),
]
for label, column in highest_report:
    highest_value = df[column].max()
    print(label, highest_value)
    print(df.loc[df[column] == highest_value, 'Country'])

# 5.Feature Engineering

Check the correlation between columns

In [20]:
# Pairwise correlation of the numeric columns only. Non-numeric columns such
# as 'Country' are excluded explicitly: pandas >= 2.0 no longer drops them
# silently in corr() and raises on string data instead.
df.select_dtypes(include='number').corr()

Plot heatmap of correlation between columns 

In [21]:
# Annotated heatmap of the correlation matrix. Only numeric columns are
# correlated, because corr() on a frame that still contains the string
# 'Country' column raises on pandas >= 2.0.
numeric_corr = df.select_dtypes(include='number').corr()
plt.figure(figsize=(20,12))
sns.heatmap(numeric_corr,annot=True,cmap='coolwarm')


In [22]:
# Grid of pairwise plots across the frame's variables.
sns.pairplot(df)

# 6.Predictive Model

Linear Regression Model

In [23]:
from sklearn.model_selection import train_test_split

In [24]:
# List the columns to choose the model's features and target from.
df.columns

In [25]:
# Predictor columns for the price model.
# NOTE(review): 'World Share %' was recomputed earlier in this notebook as a
# constant multiple of 'Daily Oil Consumption (Barrels)', so these two features
# are perfectly collinear — consider keeping only one of them.
feature_columns = [
    'Daily Oil Consumption (Barrels)',
    'World Share %',
    'Yearly Gallons Per Capita',
]
X = df[feature_columns]

In [26]:
# Regression target: the petrol price per liter in USD.
y=df['Price Per Liter (USD)']

In [27]:
# Hold out 30% of the rows for evaluation; the fixed random_state makes the
# split reproducible between runs.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.30,
    random_state=101,
)

In [28]:
from sklearn.linear_model import LinearRegression

In [29]:
# Ordinary linear regression estimator with its default settings.
lm=LinearRegression()

In [30]:
# Fit the model on the training split only.
lm.fit(X_train,y_train)

In [31]:
# Predicted prices for the held-out test features.
prediction=lm.predict(X_test)

In [32]:
# Display the predicted values.
prediction

In [33]:
# Predicted values on the x-axis against the actual test targets on the y-axis.
plt.scatter(prediction,y_test)

In [34]:
# Distribution of the residuals (actual minus predicted) on the test split.
residuals = y_test - prediction
plt.figure(figsize=(20, 12))
sns.histplot(residuals, bins=100)

In [35]:
# Model score (R^2 from LinearRegression.score) as a percentage, rounded to
# two decimals. It is measured on the held-out test split, consistent with the
# error metrics computed below; scoring on the training data the model was
# fitted on would overstate how well it generalizes.
accu_sc = round(lm.score(X_test, y_test) * 100, 2)

In [36]:
# Display the model score computed above (a percentage).
accu_sc

In [37]:
from sklearn import metrics

In [38]:
# Mean absolute error on the test split. Arguments are given in scikit-learn's
# (y_true, y_pred) order; the metric is symmetric, so the value is unchanged.
metrics.mean_absolute_error(y_true=y_test, y_pred=prediction)

In [39]:
# Mean squared error on the test split. Arguments are given in scikit-learn's
# (y_true, y_pred) order; the metric is symmetric, so the value is unchanged.
metrics.mean_squared_error(y_true=y_test, y_pred=prediction)

In [40]:
# Root mean squared error: the square root of the test-split MSE.
test_mse = metrics.mean_squared_error(y_true=y_test, y_pred=prediction)
np.sqrt(test_mse)

# 7.Data Visualization

In [41]:
# Interactive box plot of the per-liter price across all countries.
fig = px.box(
    df['Price Per Liter (USD)'],
    width=800,
    height=600,
    template='simple_white',
)
fig.update_layout(
    title_text='Box plot of Price Per Liter (USD) of petrol world wide',
    title_x=0.5,  # centre the title horizontally
)
fig.show()

In [42]:
# Histogram of the per-liter price across all countries.
fig = px.histogram(
    df,
    x='Price Per Liter (USD)',
    nbins=100,
    width=800,
    height=500,
    template='simple_white',
)
fig.update_layout(
    title_text='Histogram of Price Per Liter (USD) of petrol world wide',
    title_x=0.5,  # centre the title horizontally
)
fig.show()

In [43]:
# The ten rows with the highest daily oil consumption, largest first.
# nlargest(10, ...) already returns at most ten rows, so the frame is shown as is.
top_10_countries = df.nlargest(n=10, columns='Daily Oil Consumption (Barrels)')
top_10_countries

In [44]:
# Rank the countries inside this cell instead of reading the shared
# `top_10_countries` variable: later cells rebind that name to other rankings,
# so re-running this cell out of order would otherwise chart the wrong data.
top_10_by_consumption = df.nlargest(10, 'Daily Oil Consumption (Barrels)')
fig = px.bar(
    top_10_by_consumption,
    x='Daily Oil Consumption (Barrels)',
    y='Country',
    height=300,
    width=800,
    template='simple_white',
)
fig.update_layout(title={'text':'Daily Oil Consumption (Barrels) - Top 10 Countries'},title_x=0.5)
fig.show()

In [45]:
# Horizontal bar chart of the ten countries with the largest world share.
top_10_countries = df.nlargest(n=10, columns='World Share %')
fig = px.bar(
    top_10_countries,
    x='World Share %',
    y='Country',
    width=800,
    height=300,
    template='simple_white',
)
fig.update_layout(title_text='World Share % of petrol - Top 10 Countries', title_x=0.5)
fig.show()

In [46]:
# Horizontal bar chart of the ten countries with the highest yearly gallons
# per capita.
top_10_countries = df.nlargest(n=10, columns='Yearly Gallons Per Capita')
fig = px.bar(
    top_10_countries,
    x='Yearly Gallons Per Capita',
    y='Country',
    width=800,
    height=300,
    template='simple_white',
)
fig.update_layout(title_text='Yearly Gallons Per Capita - Top 10 Countries', title_x=0.5)
fig.show()

In [47]:
# Horizontal bar chart of the ten countries with the highest price per liter.
top_10_countries = df.nlargest(n=10, columns='Price Per Liter (USD)')
fig = px.bar(
    top_10_countries,
    x='Price Per Liter (USD)',
    y='Country',
    width=800,
    height=300,
    template='simple_white',
)
fig.update_layout(title_text='Bar plot of Price Per Liter (USD) of petrol', title_x=0.5)
fig.show()

In [48]:
# Row subsets for the three countries compared in the charts below.
country_names = df['Country']
df_Pak = df.loc[country_names == 'Pakistan']
df_Ind = df.loc[country_names == 'India']
df_Ban = df.loc[country_names == 'Bangladesh']


In [49]:
# Stack the three country subsets into one comparison frame, keeping the
# Pakistan, India, Bangladesh order.
df_Com=pd.concat([df_Pak,df_Ind,df_Ban])

In [50]:
# Daily oil consumption for the three compared countries.
fig = px.bar(
    df_Com,
    x='Daily Oil Consumption (Barrels)',
    y='Country',
    width=800,
    height=300,
    template='simple_white',
)
fig.update_layout(title_text='Daily Oil Consumption (Barrels)', title_x=0.5)
fig.show()

In [51]:
# World share percentage for the three compared countries.
fig = px.bar(
    df_Com,
    x='World Share %',
    y='Country',
    width=800,
    height=300,
    template='simple_white',
)
fig.update_layout(title_text='World Share % of petrol', title_x=0.5)
fig.show()

In [52]:
# Yearly gallons per capita for the three compared countries.
fig = px.bar(
    df_Com,
    x='Yearly Gallons Per Capita',
    y='Country',
    width=800,
    height=300,
    template='simple_white',
)
fig.update_layout(title_text='Yearly Gallons Per Capita', title_x=0.5)
fig.show()

In [53]:
# Price per liter for the three compared countries.
fig = px.bar(
    df_Com,
    x='Price Per Liter (USD)',
    y='Country',
    width=800,
    height=300,
    template='simple_white',
)
fig.update_layout(title_text='Bar plot of Price Per Liter (USD) of petrol', title_x=0.5)
fig.show()

In [54]:
# Closing message at the end of the notebook.
print('Thanks')