## Introduction to Data Analysis

In [1]:
# List the working directory to see which dataset files are available to load
!ls 

Affairs.csv
Data Analysis with Python.ipynb
Learning Python for Data Science-Examples and Stuff.ipynb
Learning Python for Data Science.ipynb
Statistics with Python.ipynb
abalone.data
abalone.names
affair data desc.txt
madlibs.py
train.csv


## Sourcing Data

In [2]:
import pandas as pd # import libraries
import numpy as np 

In [3]:
# header=None: treat the first line of the file as data, so pandas assigns integer column labels
abalone = pd.read_csv('abalone.data', header=None)

In [4]:
abalone.head() # Preview the first five rows of the freshly loaded DataFrame

Unnamed: 0,0,1,2,3,4,5,6,7,8
0,M,0.455,0.365,0.095,0.514,0.2245,0.101,0.15,15
1,M,0.35,0.265,0.09,0.2255,0.0995,0.0485,0.07,7
2,F,0.53,0.42,0.135,0.677,0.2565,0.1415,0.21,9
3,M,0.44,0.365,0.125,0.516,0.2155,0.114,0.155,10
4,I,0.33,0.255,0.08,0.205,0.0895,0.0395,0.055,7


In [5]:
# Replace the integer column labels with descriptive names.
# The labels are assigned positionally, so the order here must match the file's column order.
abalone_column_names = [
    'Sex',
    'Length',
    'Diameter',
    'Height',
    'Whole weight',
    'Shucked weight',
    'Viscera weight',
    'Shell weight',
    'Rings',
]
abalone.columns = abalone_column_names

In [6]:
abalone.head() # Confirm the descriptive column names are now in place

Unnamed: 0,Sex,Length,Diameter,Height,Whole weight,Shucked weight,Viscera weight,Shell weight,Rings
0,M,0.455,0.365,0.095,0.514,0.2245,0.101,0.15,15
1,M,0.35,0.265,0.09,0.2255,0.0995,0.0485,0.07,7
2,F,0.53,0.42,0.135,0.677,0.2565,0.1415,0.21,9
3,M,0.44,0.365,0.125,0.516,0.2155,0.114,0.155,10
4,I,0.33,0.255,0.08,0.205,0.0895,0.0395,0.055,7


## Sourcing Data from Kaggle

In [7]:
titanic_data = pd.read_csv('train.csv') # Load train.csv into a DataFrame (its first row supplies the column names)

In [8]:
titanic_data.shape # Dimensions of the DataFrame as (rows, columns)

(891, 12)

In [9]:
titanic_data.columns # Attribute holding the column labels as an Index

Index(['PassengerId', 'Survived', 'Pclass', 'Name', 'Sex', 'Age', 'SibSp',
       'Parch', 'Ticket', 'Fare', 'Cabin', 'Embarked'],
      dtype='object')

In [10]:
titanic_data.head() # Preview the first five rows

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


In [11]:
titanic_data.info() # Summary: dtype and non-null count per column, plus memory usage

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 12 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   PassengerId  891 non-null    int64  
 1   Survived     891 non-null    int64  
 2   Pclass       891 non-null    int64  
 3   Name         891 non-null    object 
 4   Sex          891 non-null    object 
 5   Age          714 non-null    float64
 6   SibSp        891 non-null    int64  
 7   Parch        891 non-null    int64  
 8   Ticket       891 non-null    object 
 9   Fare         891 non-null    float64
 10  Cabin        204 non-null    object 
 11  Embarked     889 non-null    object 
dtypes: float64(2), int64(5), object(5)
memory usage: 83.7+ KB


## Pandas Data Structure

In [12]:
# pandas is built around two core data structures:
# DataFrame - 2D - a table of rows and columns
# Series - 1D - a single labelled column of values

In [13]:
titanic_data.head() # A DataFrame: a 2D table of rows and columns

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


In [14]:
titanic_data.shape
# shape is (rows, columns): axis 0 = rows, axis 1 = columns

(891, 12)

In [15]:
type(titanic_data) # Confirm the object is a pandas DataFrame

pandas.core.frame.DataFrame

In [16]:
titanic_data['Sex'].head() # Selecting a single column by label yields a Series

0      male
1    female
2    female
3    female
4      male
Name: Sex, dtype: object

In [17]:
series_titanic = titanic_data['Sex'] # Keep the 'Sex' column as a Series for inspection

In [18]:
series_titanic.shape # A Series is 1D, so its shape has a single entry

(891,)

## Analysis of Extra-marital affairs

In [None]:
# Data source: the Rdatasets collection
# https://vincentarelbundock.github.io/Rdatasets/datasets.html

In [19]:
import pandas as pd # Data Manipulation
import numpy as np # Numerical Analysis
import seaborn as sns # Charts & Graphs
import statistics as st # Statistics 

In [20]:
data_affairs = pd.read_csv('Affairs.csv') # Load Affairs.csv into a DataFrame

In [22]:
data_affairs.shape # Dimensions as (rows, columns)

(601, 10)

In [23]:
data_affairs.head() # Preview the first five rows

Unnamed: 0.1,Unnamed: 0,affairs,gender,age,yearsmarried,children,religiousness,education,occupation,rating
0,4,0,male,37.0,10.0,no,3,18,7,4
1,5,0,female,27.0,4.0,no,4,14,6,4
2,11,0,female,32.0,15.0,yes,1,12,1,4
3,16,0,male,57.0,15.0,yes,5,18,6,5
4,23,0,male,22.0,0.75,no,2,17,6,3


In [24]:
data_affairs.info() # Concise summary: dtypes, non-null counts, memory usage

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 601 entries, 0 to 600
Data columns (total 10 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   Unnamed: 0     601 non-null    int64  
 1   affairs        601 non-null    int64  
 2   gender         601 non-null    object 
 3   age            601 non-null    float64
 4   yearsmarried   601 non-null    float64
 5   children       601 non-null    object 
 6   religiousness  601 non-null    int64  
 7   education      601 non-null    int64  
 8   occupation     601 non-null    int64  
 9   rating         601 non-null    int64  
dtypes: float64(2), int64(6), object(2)
memory usage: 47.1+ KB


In [26]:
data_affairs['Unnamed: 0'].head(10) # First 10 values of 'Unnamed: 0': non-consecutive integers, presumably a row ID (verify against the data description)

0     4
1     5
2    11
3    16
4    23
5    29
6    44
7    45
8    47
9    49
Name: Unnamed: 0, dtype: int64

In [27]:
# Give the 'Unnamed: 0' column a meaningful label.
# rename() returns a new DataFrame, so data_affairs itself keeps its original labels.
id_column_mapping = {'Unnamed: 0': 'ID'}
data_affairs_1 = data_affairs.rename(columns=id_column_mapping)

In [28]:
data_affairs_1.head() # Confirm the first column is now labelled 'ID'

Unnamed: 0,ID,affairs,gender,age,yearsmarried,children,religiousness,education,occupation,rating
0,4,0,male,37.0,10.0,no,3,18,7,4
1,5,0,female,27.0,4.0,no,4,14,6,4
2,11,0,female,32.0,15.0,yes,1,12,1,4
3,16,0,male,57.0,15.0,yes,5,18,6,5
4,23,0,male,22.0,0.75,no,2,17,6,3


In [29]:
data_affairs_1.info() # Re-check the summary after the rename: only the column label should differ

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 601 entries, 0 to 600
Data columns (total 10 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   ID             601 non-null    int64  
 1   affairs        601 non-null    int64  
 2   gender         601 non-null    object 
 3   age            601 non-null    float64
 4   yearsmarried   601 non-null    float64
 5   children       601 non-null    object 
 6   religiousness  601 non-null    int64  
 7   education      601 non-null    int64  
 8   occupation     601 non-null    int64  
 9   rating         601 non-null    int64  
dtypes: float64(2), int64(6), object(2)
memory usage: 47.1+ KB
