# DataFrames

DataFrames are the workhorse of pandas and are directly inspired by the R programming language. We can think of a DataFrame as a bunch of Series objects put together to share the same index. Let's use pandas to explore this topic!

In [0]:
import pandas as pd
import numpy as np

In [0]:
from numpy.random import randn
np.random.seed(101)

In [3]:
randn(5,4)

array([[ 2.70684984,  0.62813271,  0.90796945,  0.50382575],
       [ 0.65111795, -0.31931804, -0.84807698,  0.60596535],
       [-2.01816824,  0.74012206,  0.52881349, -0.58900053],
       [ 0.18869531, -0.75887206, -0.93323722,  0.95505651],
       [ 0.19079432,  1.97875732,  2.60596728,  0.68350889]])

In [4]:
'A B C D E'.split()

['A', 'B', 'C', 'D', 'E']

In [5]:
'W X Y Z'.split()

['W', 'X', 'Y', 'Z']

In [0]:
df = pd.DataFrame(randn(5,4),index='A B C D E'.split(),columns='W X Y Z'.split())

In [7]:
df

Unnamed: 0,W,X,Y,Z
A,0.302665,1.693723,-1.706086,-1.159119
B,-0.134841,0.390528,0.166905,0.184502
C,0.807706,0.07296,0.638787,0.329646
D,-0.497104,-0.75407,-0.943406,0.484752
E,-0.116773,1.901755,0.238127,1.996652


In [0]:
df_from_csv = pd.read_csv("/content/mpg.csv")

In [68]:
df_from_csv.head()

Unnamed: 0.1,Unnamed: 0,manufacturer,model,displ,year,cyl,trans,drv,cty,hwy,fl,class
0,1,audi,a4,1.8,1999,4,auto(l5),f,18,29,p,compact
1,2,audi,a4,1.8,1999,4,manual(m5),f,21,29,p,compact
2,3,audi,a4,2.0,2008,4,manual(m6),f,20,31,p,compact
3,4,audi,a4,2.0,2008,4,auto(av),f,21,30,p,compact
4,5,audi,a4,2.8,1999,6,auto(l5),f,16,26,p,compact


In [71]:
df_from_csv.dtypes

Unnamed: 0        int64
manufacturer     object
model            object
displ           float64
year              int64
cyl               int64
trans            object
drv              object
cty               int64
hwy               int64
fl               object
class            object
dtype: object

In [69]:
df_from_csv['manufacturer']

0            audi
1            audi
2            audi
3            audi
4            audi
          ...    
229    volkswagen
230    volkswagen
231    volkswagen
232    volkswagen
233    volkswagen
Name: manufacturer, Length: 234, dtype: object

In [70]:
df_from_csv.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 234 entries, 0 to 233
Data columns (total 12 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   Unnamed: 0    234 non-null    int64  
 1   manufacturer  234 non-null    object 
 2   model         234 non-null    object 
 3   displ         234 non-null    float64
 4   year          234 non-null    int64  
 5   cyl           234 non-null    int64  
 6   trans         234 non-null    object 
 7   drv           234 non-null    object 
 8   cty           234 non-null    int64  
 9   hwy           234 non-null    int64  
 10  fl            234 non-null    object 
 11  class         234 non-null    object 
dtypes: float64(1), int64(5), object(6)
memory usage: 22.1+ KB


# Constructing DataFrame objects


A Pandas DataFrame can be constructed in a variety of ways. Here we’ll give several
examples.

# 1 . From a single Series object.

A DataFrame is a collection of Series objects, and a single-
column DataFrame can be constructed from a single Series :

In [0]:
# area_dict and population_dict are dictionaries

area_dict = {'California': 423967, 
             'Texas': 695662,
             'New York': 141297,
             'Florida': 170312,
             'Illinois': 149995}

population_dict = {'California': 38332521,
              'Texas': 26448193,
              'New York': 19651127,
              'Florida': 19552860,
              'Illinois': 12882135}

In [13]:
area_series = pd.Series(area_dict)

population_series = pd.Series(population_dict)

print( " ********* Area Series ********** ")
print(area_series)

print("\n")

print( " ********* Population Series ********** ")
print(population_series)


 ********* Area Series ********** 
California    423967
Texas         695662
New York      141297
Florida       170312
Illinois      149995
dtype: int64


 ********* Population Series ********** 
California    38332521
Texas         26448193
New York      19651127
Florida       19552860
Illinois      12882135
dtype: int64


In [14]:
#states_df is a dataframes
states_df = pd.DataFrame({'area': area_series})
states_df

Unnamed: 0,area
California,423967
Texas,695662
New York,141297
Florida,170312
Illinois,149995


# 2. From a dictionary of Series objects.

A DataFrame can be constructed from a dictionary of Series objects as well

In [15]:
states_df = pd.DataFrame({'area': area_series, 'population' : population_series})
states_df

Unnamed: 0,area,population
California,423967,38332521
Texas,695662,26448193
New York,141297,19651127
Florida,170312,19552860
Illinois,149995,12882135


#3. From a list of dicts.

Any list of dictionaries can be made into a DataFrame . We’ll use a
simple list comprehension to create some data:

In [16]:
list_data = [{'a': i, 'b': 2 * i} for i in range(0,5)]
list_data

[{'a': 0, 'b': 0},
 {'a': 1, 'b': 2},
 {'a': 2, 'b': 4},
 {'a': 3, 'b': 6},
 {'a': 4, 'b': 8}]

In [17]:
pd.DataFrame(list_data)

Unnamed: 0,a,b
0,0,0
1,1,2
2,2,4
3,3,6
4,4,8


Even if some keys in the dictionary are missing, Pandas will fill them in with NaN (i.e.,
“not a number”) values:

In [18]:
pd.DataFrame([{'a': 1, 'b': 2},
              {'b': 3, 'c': 4}])

Unnamed: 0,a,b,c
0,1.0,2,
1,,3,4.0


#4. From a two-dimensional NumPy array

Given a two-dimensional array of data, we can
create a DataFrame with any specified column and index names. If omitted, an integer
index will be used for each:

In [19]:
pd.DataFrame(np.random.rand(3, 2),
columns=['Column 1', 'Column 2'],
index=['a', 'b', 'c'])

Unnamed: 0,Column 1,Column 2
a,0.043397,0.223937
b,0.575205,0.120434
c,0.500117,0.13801


# 5. From a NumPy structured array.

A Pandas DataFrame operates much like a
structured array, and can be created directly from one:

In [20]:
zeros_array = np.zeros(3)
zeros_array

array([0., 0., 0.])

In [21]:
zeros_structured_array = np.zeros(3, dtype=[('A', 'i8'), ('B', 'f8')])
zeros_structured_array

array([(0, 0.), (0, 0.), (0, 0.)], dtype=[('A', '<i8'), ('B', '<f8')])

In [22]:
pd.DataFrame(zeros_structured_array)

Unnamed: 0,A,B
0,0,0.0
1,0,0.0
2,0,0.0


#Dataframe Attributes

##Index

In [26]:
states_df

Unnamed: 0,area,population
California,423967,38332521
Texas,695662,26448193
New York,141297,19651127
Florida,170312,19552860
Illinois,149995,12882135


Like the Series object, the DataFrame has an index attribute that gives access to the
index labels:

In [24]:
states_df.index

Index(['California', 'Texas', 'New York', 'Florida', 'Illinois'], dtype='object')

##Columns

Additionally, the DataFrame has a columns attribute, which is an Index object holding
the column labels:

In [27]:
states_df.columns

Index(['area', 'population'], dtype='object')

## dtypes




In [28]:
states_df.dtypes

area          int64
population    int64
dtype: object

In [29]:
df = pd.DataFrame({'float_column': [1.0, 2.0,3.0],

                   'int_column': [1, 2, 3],

                   'datetime_column': [pd.Timestamp('20180310') , pd.Timestamp('20180310' ), pd.Timestamp('20180310')],

                   'string_column': ['Anna' , 'Bob' , 'Cario']})

df

Unnamed: 0,float_column,int_column,datetime_column,string_column
0,1.0,1,2018-03-10,Anna
1,2.0,2,2018-03-10,Bob
2,3.0,3,2018-03-10,Cario


In [30]:
df.dtypes

float_column              float64
int_column                  int64
datetime_column    datetime64[ns]
string_column              object
dtype: object

## select_dtypes() 

This is a method of dataframe object, but i have mentioned it over here to distinguish the dtypes and select_dtypes


In [31]:
df = pd.DataFrame({'column 1': [1, 2] * 3,
                   'column 2': [True, False] * 3,
                   'column 3': [1.0, 2.0] * 3,
                   'column 4': [4, 5] * 3,})

df

Unnamed: 0,column 1,column 2,column 3,column 4
0,1,True,1.0,4
1,2,False,2.0,5
2,1,True,1.0,4
3,2,False,2.0,5
4,1,True,1.0,4
5,2,False,2.0,5


In [32]:
df.select_dtypes(include='int')

Unnamed: 0,column 1,column 4
0,1,4
1,2,5
2,1,4
3,2,5
4,1,4
5,2,5


In [33]:
df.select_dtypes(include='bool')

Unnamed: 0,column 2
0,True
1,False
2,True
3,False
4,True
5,False


In [34]:
df.select_dtypes(include='float')

Unnamed: 0,column 3
0,1.0
1,2.0
2,1.0
3,2.0
4,1.0
5,2.0


## values



In [35]:
df = pd.DataFrame({'age':    [ 3,  29],

                   'height': [94, 170],

                   'weight': [31, 115]})
df

Unnamed: 0,age,height,weight
0,3,94,31
1,29,170,115


In [36]:
df.values

array([[  3,  94,  31],
       [ 29, 170, 115]])

## Alternative to values - to_numpy (Recommended)


In [37]:
df.to_numpy()

array([[  3,  94,  31],
       [ 29, 170, 115]])

In [40]:
tuples_enclosed_in_list_mixed_dtype = [('parrot',   24.0, 'second'),
                           ('lion',     80.5, 1),
                           ('monkey', np.nan, None)]
                           
df2 = pd.DataFrame( tuples_enclosed_in_list_mixed_dtype, columns=('name', 'max_speed', 'rank'))
df2

Unnamed: 0,name,max_speed,rank
0,parrot,24.0,second
1,lion,80.5,1
2,monkey,,


In [42]:
df2.to_array() # will cause error

AttributeError: ignored

A DataFrame with mixed type columns(e.g., str/object, int64, float32) results in an ndarray of the broadest type that accommodates these mixed types (e.g., object).

In [43]:
df2.values

array([['parrot', 24.0, 'second'],
       ['lion', 80.5, 1],
       ['monkey', nan, None]], dtype=object)

## size

In [44]:
df

Unnamed: 0,age,height,weight
0,3,94,31
1,29,170,115


In [46]:
df.size

6

##shape

In [47]:
df.shape

(2, 3)

In [48]:
print("Number of rows = " , df.shape[0])
print("Number of columns = " , df.shape[1])

Number of rows =  2
Number of columns =  3


# T 

Return the transpose, which is by definition self.

In [49]:
df.T

Unnamed: 0,0,1
age,3,29
height,94,170
weight,31,115


In [50]:
df.T.loc['age']

0     3
1    29
Name: age, dtype: int64

## Methods of dataframe

## head()

In [51]:
df = pd.DataFrame({
                   'col1':[1,2,3,4,5,6,7,8,9,10],
                   'col2':[111,222,333,444,555,666,777,888,999,1000],
                   'col3':['abc','def','ghi','jkl', 'mno', 'pqr', 'stu','vwx','yz', 'abc'],
                   })
df

Unnamed: 0,col1,col2,col3
0,1,111,abc
1,2,222,def
2,3,333,ghi
3,4,444,jkl
4,5,555,mno
5,6,666,pqr
6,7,777,stu
7,8,888,vwx
8,9,999,yz
9,10,1000,abc


In [52]:
df.head()

Unnamed: 0,col1,col2,col3
0,1,111,abc
1,2,222,def
2,3,333,ghi
3,4,444,jkl
4,5,555,mno


In [54]:
df.head(7)

Unnamed: 0,col1,col2,col3
0,1,111,abc
1,2,222,def
2,3,333,ghi
3,4,444,jkl
4,5,555,mno
5,6,666,pqr
6,7,777,stu


##tail()

In [55]:
df.tail()

Unnamed: 0,col1,col2,col3
5,6,666,pqr
6,7,777,stu
7,8,888,vwx
8,9,999,yz
9,10,1000,abc


In [56]:
df.tail(7)

Unnamed: 0,col1,col2,col3
3,4,444,jkl
4,5,555,mno
5,6,666,pqr
6,7,777,stu
7,8,888,vwx
8,9,999,yz
9,10,1000,abc


##unique()

In [57]:
df['col3'].unique()

array(['abc', 'def', 'ghi', 'jkl', 'mno', 'pqr', 'stu', 'vwx', 'yz'],
      dtype=object)

##nunique()

In [58]:
df['col3'].nunique()

9

##value_counts()

In [61]:
df['col3'].value_counts()

abc    2
pqr    1
mno    1
def    1
ghi    1
vwx    1
jkl    1
yz     1
stu    1
Name: col3, dtype: int64

In [62]:
# Repetition # But notice the difference
#value - attribute
#count_values - method

df['col3'].values

array(['abc', 'def', 'ghi', 'jkl', 'mno', 'pqr', 'stu', 'vwx', 'yz',
       'abc'], dtype=object)

## sorting

In [63]:
df

Unnamed: 0,col1,col2,col3
0,1,111,abc
1,2,222,def
2,3,333,ghi
3,4,444,jkl
4,5,555,mno
5,6,666,pqr
6,7,777,stu
7,8,888,vwx
8,9,999,yz
9,10,1000,abc


In [65]:
df.sort_values(by='col3') #inplace=False by default

#df.sort_values(by='col3' , inplace= True) 

Unnamed: 0,col1,col2,col3
0,1,111,abc
9,10,1000,abc
1,2,222,def
2,3,333,ghi
3,4,444,jkl
4,5,555,mno
5,6,666,pqr
6,7,777,stu
7,8,888,vwx
8,9,999,yz


In [66]:
df

Unnamed: 0,col1,col2,col3
0,1,111,abc
1,2,222,def
2,3,333,ghi
3,4,444,jkl
4,5,555,mno
5,6,666,pqr
6,7,777,stu
7,8,888,vwx
8,9,999,yz
9,10,1000,abc


# Null Values or Check for Null Values

    isnull()
    notnull()
    fillna()

    and many more....

Will discuss in the Handling Missing Data Notebook file