# <font color = darkblue size=50px> Open Data Lab

# <font color = darkblue> Government College Kasaragod

## <font color=magenta> Session - 4 Introduction to Data Visualisation

---

In [None]:
# Importing necessary libraries

import numpy as np
import pandas as pd

# importing visualisation libraries

import matplotlib.pyplot as plt
import seaborn as sns

## <font color = green> Line Plot
* A line chart shows data as a series of points joined by line segments, so you can see how the values change continuously along the x-axis (number line).
* Used to compare numerical sets of values

In [None]:
# Minimal line plot: the identity line y = x through five integer points.
x = list(range(1, 6))
y = x
plt.plot(x, y)  # draw the line through the (x, y) pairs
plt.show()  # display the figure

### <font color = green> Let's make it much easier to interpret

In [None]:
# Same identity line, now on a larger canvas with axis labels, a title and a grid.
x = list(range(1, 6))
y = x

plt.figure(figsize=(10, 6))  # figure size (width, height) in inches
plt.plot(x, y)

# Annotations that make the chart readable on its own.
plt.xlabel('x-values')
plt.ylabel('y-values')
plt.title('Line plot')
plt.grid()  # turn the background grid on

plt.show()  # display the figure

#### To add more than one line to our plot

In [None]:
# Two lines on the same axes: the identity line and a second, hand-written series.
x = list(range(1, 6))
y = x
z = [4, 1.5, 4.5, 5, 3]

title_style = dict(color='green', fontweight='bold', size=15)

plt.figure(figsize=(10, 6))

# Each plt.plot call adds one line; `label` is the text shown by plt.legend().
plt.plot(x, y, color='red', label='x=y')           # explicit line colour
plt.plot(x, z, linestyle='dashed', label='trend')  # explicit line style

plt.xlabel('x-values')
plt.ylabel('y-values')
plt.title('Multiple Line Plots', **title_style)

plt.grid()
plt.legend()
plt.show()

### <font color = green> Subplots

For creating separate (multiple) plots in the same figure.<br>
<font color=magenta>plt.subplot(num_rows, num_cols, index_no)

In [None]:
# Draw sine and cosine side by side in one figure (1 row x 2 columns of subplots).
plt.figure(figsize=(15,5))

x = np.linspace(0,20,400)  # 400 evenly spaced sample points on [0, 20]
y = np.sin(x)
z = np.cos(x)

# Left panel: subplot index 1 of the 1x2 grid.
plt.subplot(1,2,1)
plt.plot(x, y, color='red', label='sine') # color argument is used to specify the color of line
plt.xlabel('x')
plt.ylabel('sin(x)')
plt.title('sine function', color='green', fontweight='bold', size=12)

# Right panel: subplot index 2 of the 1x2 grid.
plt.subplot(1,2,2)
plt.plot(x, z, linestyle='dashed', label='cosine') # line style is specified
plt.xlabel('x')
plt.ylabel('cos(x)')
plt.title('cosine function', color='green', fontweight='bold', size=12)

# tight_layout must be called (with parentheses) for the subplot spacing to be
# adjusted; referencing the function without calling it does nothing.
plt.tight_layout()
plt.show()

## <font color = green> Bar Plot
* Used to compare numerical data over some categories/groups. 

In [None]:
# Let's load a dataset first: read IRIS.csv into a DataFrame and preview it.
data = pd.read_csv(filepath_or_buffer='IRIS.csv')
data.head(n=5)  # first five rows, shown as the cell output

In [None]:
# Bar graph (matplotlib): number of rows per species.

n = data['species'].value_counts()  # one entry per unique species, value = its frequency
species, count = n.index, n.values  # category names and their matching counts

plt.figure(figsize=(8, 5))
plt.bar(species, count, color='lightgreen')
plt.xlabel('species')
plt.ylabel('count')
plt.title('IRIS species count')
plt.show()

In [None]:
# Bar graph (seaborn): same per-species counts, drawn with sns.barplot.

n = data['species'].value_counts()
species, count = n.index, n.values

plt.figure(figsize=(8, 5))
sns.barplot(x=species, y=count)
plt.title('IRIS species count')
plt.show()

## <font color=green> Countplot
    Used to show the counts of observations in each categorical bin using bars.

In [None]:
# Count plot: seaborn tallies the occurrences of each species itself,
# so no explicit value_counts() step is needed.
species_column = data['species']

plt.figure(figsize=(8, 5))
sns.countplot(x=species_column)
plt.title('IRIS species count')
plt.show()

## <font color = green> Histograms
    Used for the representation of frequency distribution (or we can say probability distribution) of the data. <br>
    We can define **bins** for the plot (i.e. break the entire range of values into a series of intervals and count how many values fall in each interval).

In [None]:
# Histogram (matplotlib) of 1000 samples from the standard normal distribution.
# A seeded generator is used so the figure is identical on every Restart & Run All.
rng = np.random.default_rng(42)
random = rng.standard_normal(1000)  # name kept as-is: the following seaborn cell reads `random`

plt.figure(figsize=(8,5))
plt.hist(random, bins = 40, color='lightgreen')  # split the value range into 40 intervals
plt.grid()
plt.xlabel('points')
plt.title("Histogram")
plt.show()

In [None]:
# Histogram (seaborn) of the same `random` sample; kde=True overlays a
# kernel density estimate curve on the bars.
hist_options = dict(bins=40, kde=True)

plt.figure(figsize=(8, 5))
sns.histplot(data=random, **hist_options)
plt.title("Histogram", fontsize=16)
plt.show()

In [None]:
# More attributes of the seaborn histogram: plot one DataFrame column (`x`)
# and colour the bars by another column (`hue`).
hist_options = dict(x='petal_width', hue='species', kde=True, bins=30)

plt.figure(figsize=(8, 5))
sns.histplot(data=data, **hist_options)
plt.title("Histogram", fontsize=16)
plt.show()

## <font color = green> Pie chart
    To compare numerical data against a category just like a bar plot  <br>
    Helps us to compare data as a fraction of the whole (percentages rather than raw numbers)

In [None]:
# Pie chart: share of each species as a fraction of the whole dataset.
n = data['species'].value_counts()
species, count = n.index, n.values
colors = ['lightblue', 'lightgreen', 'gold']

plt.figure(figsize=(8, 5))
plt.pie(
    count,
    labels=species,
    shadow=True,
    colors=colors,
    autopct='%1.2f%%',  # print each wedge's percentage with two decimals
)
plt.xlabel('species')
plt.show()

## <font color=green> Scatter Plots
    It is an X–Y coordinate plot that shows the relationship between two numerical data columns.

In [None]:
# Scatter plot (matplotlib) of petal length against petal width.
plt.figure(figsize=(8,5))
# c = marker colour, s = marker size, marker='^' draws triangles.
plt.scatter(data['petal_length'],data['petal_width'],c='r', s=10, marker='^')
# Axis labels name the columns that are actually plotted (the petal measurements).
plt.xlabel('Petal length')
plt.ylabel('Petal width')
plt.title('Scatter plot on Iris dataset')
plt.show()

In [None]:
# Scatter plot (seaborn): same two petal columns, with points coloured by species.
scatter_options = dict(x='petal_length', y='petal_width', hue='species')

plt.figure(figsize=(8, 5))
sns.scatterplot(data=data, **scatter_options)
plt.show()

## <font color=green> Pair plot
* A unique kind of plot available in the seaborn library
* This plots a pairwise relationship in datasets (in a single figure)

In [None]:
# Pair plot: grid of pairwise relationships between the columns of `data`,
# coloured by species. height=3 sets the size of each panel.
sns.set_style("whitegrid")
sns.pairplot(data, hue="species", height=3)
plt.show()

## <font color = green> Box Plots
* Compare the values by plotting the distribution of data based on the sample minimum, the lower quartile, the median, the upper quartile, and the sample maximum
* Help us analyze the data to find the **outliers** and the variation in the data

In [None]:
# Box plot (matplotlib): one box per remaining column after `species` is removed.
l_w = data.drop(columns='species')  # every column except species

plt.figure(figsize=(12, 8))
plt.boxplot(l_w, labels=l_w.columns)  # label each box with its column name
plt.xlabel('Flower measurements', fontsize=15)
plt.ylabel('values', fontsize=15)
plt.title("Iris dataset analysis", color='red', fontsize=24)
plt.show()

In [None]:
# Box plot (seaborn) of the same `l_w` frame built in the previous cell.
axis_font = dict(fontsize=15)

plt.figure(figsize=(12, 10))
sns.boxplot(data=l_w)
plt.xlabel('Flower measurements', **axis_font)
plt.ylabel('values', **axis_font)
plt.title("Iris dataset analysis", color='red', fontsize=24)
plt.show()

## <font color=green> Heat Map
    Used to represent a matrix of values in the form of a ‘color-coded image plot’ (values in the data are represented as colors), for example to find the correlation of the features in data (cluster analysis)

In [None]:
# Heat map of the pairwise correlation between the features of the dataset.
plt.figure(figsize=(12,10))
# Correlation is only defined for numeric columns. Select them explicitly so the
# string-valued `species` column does not make DataFrame.corr() raise on pandas >= 2.0.
correlation = data.select_dtypes(include='number').corr()
sns.heatmap(correlation, annot=True)  # annot=True writes each value inside its cell
plt.show()

## <font color = green> To save a plot

In [None]:
# Draw the correlation heat map and write the figure to an image file.
plt.figure(figsize=(12,10))
# Restrict to numeric columns: DataFrame.corr() raises on the string-valued
# `species` column with pandas >= 2.0.
correlation = data.select_dtypes(include='number').corr()
sns.heatmap(correlation, annot=True)
plt.savefig('sample.jpg')  # saves the current figure to this path