# Load CSV Dataset

In [12]:
# import libraries
import pandas as pd

# open a local csv file

In [3]:
# Read the dataset
df = pd.read_csv('../Dataset/babar-t20i-stats.csv')
df.head()

Unnamed: 0,Runs,Min,BF,4s,6s,SR,Pos,Dismissal,Inns,Opposition,Ground,Start Date
0,15*,13,11,2,0,136.36,3,not out,2,v England,Manchester,7 Sep 2016
1,55*,49,37,6,2,148.64,3,not out,2,v West Indies,Dubai (DICS),23 Sep 2016
2,19,-,18,2,0,105.55,3,caught,1,v West Indies,Dubai (DICS),24 Sep 2016
3,27*,42,24,1,0,112.5,3,not out,2,v West Indies,Abu Dhabi,27 Sep 2016
4,29,-,30,3,0,96.66,3,caught,2,v West Indies,Bridgetown,26 Mar 2017


# Opening csv from URL

In [4]:
# Import the 'requests' library to fetch data from web URLs.
import requests
# Import 'StringIO' to treat strings as in-memory text files.
from io import StringIO

# Initialize an empty string variable to store the URL.
url= ''

In [5]:
# Read only the "Dismissal" and "Runs" columns from the CSV into a DataFrame.
df = pd.read_csv('../Dataset/babar-t20i-stats.csv', usecols=["Dismissal", "Runs"])
# Show the first 5 rows to check the data.
df.head()

Unnamed: 0,Runs,Dismissal
0,15*,not out
1,55*,not out
2,19,caught
3,27*,not out
4,29,caught


In [6]:
# Read only the first 100 rows from the CSV file into a DataFrame.
df = pd.read_csv('../Dataset/babar-t20i-stats.csv', nrows=100)

# Show the first 5 rows of the loaded data.
df.head()

Unnamed: 0,Runs,Min,BF,4s,6s,SR,Pos,Dismissal,Inns,Opposition,Ground,Start Date
0,15*,13,11,2,0,136.36,3,not out,2,v England,Manchester,7 Sep 2016
1,55*,49,37,6,2,148.64,3,not out,2,v West Indies,Dubai (DICS),23 Sep 2016
2,19,-,18,2,0,105.55,3,caught,1,v West Indies,Dubai (DICS),24 Sep 2016
3,27*,42,24,1,0,112.5,3,not out,2,v West Indies,Abu Dhabi,27 Sep 2016
4,29,-,30,3,0,96.66,3,caught,2,v West Indies,Bridgetown,26 Mar 2017


In [7]:
# Read the CSV, treating the first row as data (header=None) and assigning custom column names.
df = pd.read_csv('../Dataset/babar-t20i-stats.csv', names=["Runs", "6s", "Opposition"], header=None)

# Show the first 5 rows with the new custom headers.
df.head()

Unnamed: 0,Unnamed: 1,Unnamed: 2,Unnamed: 3,Unnamed: 4,Unnamed: 5,Unnamed: 6,Unnamed: 7,Unnamed: 8,Runs,6s,Opposition
Runs,Min,BF,4s,6s,SR,Pos,Dismissal,Inns,Opposition,Ground,Start Date
15*,13,11,2,0,136.36,3,not out,2,v England,Manchester,7 Sep 2016
55*,49,37,6,2,148.64,3,not out,2,v West Indies,Dubai (DICS),23 Sep 2016
19,-,18,2,0,105.55,3,caught,1,v West Indies,Dubai (DICS),24 Sep 2016
27*,42,24,1,0,112.50,3,not out,2,v West Indies,Abu Dhabi,27 Sep 2016


In [8]:
# Count missing (NaN/null) values in each column of the DataFrame.
df.isnull().sum()

Runs          0
6s            0
Opposition    0
dtype: int64

In [9]:
# Read the CSV, interpreting strings "?" and "missing" as NaN (null) values.
df = pd.read_csv('../Dataset/babar-t20i-stats.csv', na_values=["?", "missing"])

# Show the first 5 rows to check the data.
df.head()

Unnamed: 0,Runs,Min,BF,4s,6s,SR,Pos,Dismissal,Inns,Opposition,Ground,Start Date
0,15*,13,11,2,0,136.36,3,not out,2,v England,Manchester,7 Sep 2016
1,55*,49,37,6,2,148.64,3,not out,2,v West Indies,Dubai (DICS),23 Sep 2016
2,19,-,18,2,0,105.55,3,caught,1,v West Indies,Dubai (DICS),24 Sep 2016
3,27*,42,24,1,0,112.5,3,not out,2,v West Indies,Abu Dhabi,27 Sep 2016
4,29,-,30,3,0,96.66,3,caught,2,v West Indies,Bridgetown,26 Mar 2017


In [10]:
# Process the CSV file in 10-row 'chunks' to avoid loading the entire file into memory.
for chunk in pd.read_csv('../Dataset/babar-t20i-stats.csv', chunksize=10):
    # Print the first 5 rows of the current chunk.
    print(chunk.head())

  Runs Min  BF  4s  6s      SR  Pos Dismissal  Inns     Opposition  \
0  15*  13  11   2   0  136.36    3   not out     2      v England   
1  55*  49  37   6   2  148.64    3   not out     2  v West Indies   
2   19   -  18   2   0  105.55    3    caught     1  v West Indies   
3  27*  42  24   1   0  112.50    3   not out     2  v West Indies   
4   29   -  30   3   0   96.66    3    caught     2  v West Indies   

         Ground   Start Date  
0    Manchester   7 Sep 2016  
1  Dubai (DICS)  23 Sep 2016  
2  Dubai (DICS)  24 Sep 2016  
3     Abu Dhabi  27 Sep 2016  
4    Bridgetown  26 Mar 2017  
   Runs Min  BF  4s  6s      SR  Pos Dismissal  Inns     Opposition  \
10   48   -  31   5   0  154.83    3    caught     1     v World-XI   
11    1   -   8   0   0   12.50    3   stumped     2    v Sri Lanka   
12    1   -   2   0   0   50.00    3       lbw     2    v Sri Lanka   
13  34*   -  31   2   0  109.67    3   not out     1    v Sri Lanka   
14   41  68  41   1   1  100.00    5  

In [11]:
# Read the entire CSV file into a DataFrame named 'df'.
df = pd.read_csv('../Dataset/babar-t20i-stats.csv')

# Show the first 5 rows of the DataFrame.
df.head()

Unnamed: 0,Runs,Min,BF,4s,6s,SR,Pos,Dismissal,Inns,Opposition,Ground,Start Date
0,15*,13,11,2,0,136.36,3,not out,2,v England,Manchester,7 Sep 2016
1,55*,49,37,6,2,148.64,3,not out,2,v West Indies,Dubai (DICS),23 Sep 2016
2,19,-,18,2,0,105.55,3,caught,1,v West Indies,Dubai (DICS),24 Sep 2016
3,27*,42,24,1,0,112.5,3,not out,2,v West Indies,Abu Dhabi,27 Sep 2016
4,29,-,30,3,0,96.66,3,caught,2,v West Indies,Bridgetown,26 Mar 2017
