# CSV

This notebook explores reading CSV files with pandas and experiments with related CSV functionality.

In [2]:
# Show the kernel's current working directory; the relative CSV paths used in this notebook are resolved against it.
%pwd

'/home/vivek/Documents/Projects/pandas'

In [3]:
import pandas as pd

In [6]:
# Load the iris measurements. header=None tells pandas the file carries no
# header row, so the column labels are supplied explicitly through `names`.
iris = pd.read_csv(
    'iris.csv',
    sep=',',
    decimal='.',
    header=None,
    names=[
        'sepal_length',
        'sepal_width',
        'petal_length',
        'petal_width',
        'target',
    ],
)

In [9]:
# (number of rows, number of columns) of the loaded frame
iris.shape

(150, 5)

In [10]:
# Column labels, i.e. the names supplied via `names=` when the file was read
iris.columns

Index(['sepal_length', 'sepal_width', 'petal_length', 'petal_width', 'target'], dtype='object')

In [11]:
# Number of rows for each distinct value in the 'target' column
iris['target'].value_counts()

setosa        50
versicolor    50
virginica     50
Name: target, dtype: int64

## Let us try to read some dirty data

In [17]:
# Read the messy file with no cleaning options and display the frame exactly as pandas parsed it
dirty = pd.read_csv('dirty.csv', sep=',')
dirty

Unnamed: 0,Date,Temperature_city_1,Temperature_city_2,Temperature_city_3,Unnamed: 4
0,Which_destination,,,,
1,20140910,80.0,32.0,40.0,1.0
2,20140911,100.0,50.0,36.0,2.0
3,20140912,102.0,55.0,46.0,1.0
4,20140912,60.0,20.0,35.0,3.0
5,20140914,60.0,,32.0,3.0
6,20140914,,57.0,42.0,2.0


In [19]:
# Fill every missing value with the mean of its own column.
# numeric_only=True restricts the means to the numeric columns: 'Date' holds
# strings (e.g. 'Which_destination'), and current pandas raises a TypeError on
# such a column instead of silently skipping it.
dirty.fillna(dirty.mean(axis=0, numeric_only=True))

# NOTE: axis=0 aggregates down the rows, producing one mean per column;
#       axis=1 aggregates across the columns, producing one mean per row.

Unnamed: 0,Date,Temperature_city_1,Temperature_city_2,Temperature_city_3,Unnamed: 4
0,Which_destination,80.4,42.8,38.5,2.0
1,20140910,80.0,32.0,40.0,1.0
2,20140911,100.0,50.0,36.0,2.0
3,20140912,102.0,55.0,46.0,1.0
4,20140912,60.0,20.0,35.0,3.0
5,20140914,60.0,42.8,32.0,3.0
6,20140914,80.4,57.0,42.0,2.0


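**Note: if you want `read_csv` to skip malformed lines instead of raising an error, pass** *error_bad_lines=False* **(pandas < 1.3) or its replacement** *on_bad_lines='skip'* **(pandas ≥ 1.3; `error_bad_lines` was removed in pandas 2.0), like so:**

```python
pd.read_csv("some_random_file.csv", error_bad_lines=False, sep=',')  # pandas < 1.3
pd.read_csv("some_random_file.csv", on_bad_lines='skip', sep=',')    # pandas >= 1.3
```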

## Chunks

In [34]:
# With chunksize set, read_csv hands back a reader that yields the file
# ten rows at a time instead of one complete DataFrame.
movies = pd.read_csv(
    'movies.csv',
    sep=',',
    chunksize=10,
)

In [35]:
# Walk the reader chunk by chunk, printing each chunk's shape followed by its rows.
for chunk in movies:
    print(chunk.shape, chunk, sep='\n')

(10, 4)
   Unnamed: 0  movie_id                               title  \
0           0         1                    Toy Story (1995)   
1           1         2                      Jumanji (1995)   
2           2         3             Grumpier Old Men (1995)   
3           3         4            Waiting to Exhale (1995)   
4           4         5  Father of the Bride Part II (1995)   
5           5         6                         Heat (1995)   
6           6         7                      Sabrina (1995)   
7           7         8                 Tom and Huck (1995)   
8           8         9                 Sudden Death (1995)   
9           9        10                    GoldenEye (1995)   

                         genres  
0   Animation|Children's|Comedy  
1  Adventure|Children's|Fantasy  
2                Comedy|Romance  
3                  Comedy|Drama  
4                        Comedy  
5         Action|Crime|Thriller  
6                Comedy|Romance  
7          Adventure|Child

In [36]:
# Instead of looping over and viewing every chunk, ask read_csv for a reader
# (iterator=True) and pull rows from it on demand.
# NOTE(review): the name `iter` shadows Python's built-in iter(); it is kept
# here because the following cells call iter.get_chunk().
iter = pd.read_csv(
    'movies.csv',
    sep=',',
    iterator=True,
)

In [39]:
# Pull only the next 10 rows from the reader, on demand
iter.get_chunk(10)

Unnamed: 0.1,Unnamed: 0,movie_id,title,genres
0,0,1,Toy Story (1995),Animation|Children's|Comedy
1,1,2,Jumanji (1995),Adventure|Children's|Fantasy
2,2,3,Grumpier Old Men (1995),Comedy|Romance
3,3,4,Waiting to Exhale (1995),Comedy|Drama
4,4,5,Father of the Bride Part II (1995),Comedy
5,5,6,Heat (1995),Action|Crime|Thriller
6,6,7,Sabrina (1995),Comedy|Romance
7,7,8,Tom and Huck (1995),Adventure|Children's
8,8,9,Sudden Death (1995),Action
9,9,10,GoldenEye (1995),Action|Adventure|Thriller


In [42]:
# Pull the next 20 rows. get_chunk continues from the reader's current position,
# so which rows come back depends on how many get_chunk calls have already run.
chunk20 = iter.get_chunk(20)
chunk20

Unnamed: 0.1,Unnamed: 0,movie_id,title,genres
0,50,51,Guardian Angel (1994),Action|Drama|Thriller
1,51,52,Mighty Aphrodite (1995),Comedy
2,52,53,Lamerica (1994),Drama
3,53,54,"Big Green, The (1995)",Children's|Comedy
4,54,55,Georgia (1995),Drama
5,55,56,Kids of the Round Table (1995),Adventure|Children's|Fantasy
6,56,57,Home for the Holidays (1995),Drama
7,57,58,"Postino, Il (The Postman) (1994)",Drama|Romance
8,58,59,"Confessional, The (Le Confessionnal) (1995)",Drama|Mystery
9,59,60,"Indian in the Cupboard, The (1995)",Adventure|Children's|Fantasy


In [46]:
# Build a small frame by hand. The scalar values given for Col3 and Col4 are
# repeated for every row; Col1 and Col2 provide the eight rows.
my_own_dataset = pd.DataFrame(
    {
        'Col1': range(8),
        'Col2': [1.0] * 8,
        'Col3': 1.0,
        'Col4': 'Hello World!',
    }
)

In [47]:
# Display the hand-built frame
my_own_dataset

Unnamed: 0,Col1,Col2,Col3,Col4
0,0,1.0,1.0,Hello World!
1,1,1.0,1.0,Hello World!
2,2,1.0,1.0,Hello World!
3,3,1.0,1.0,Hello World!
4,4,1.0,1.0,Hello World!
5,5,1.0,1.0,Hello World!
6,6,1.0,1.0,Hello World!
7,7,1.0,1.0,Hello World!


In [52]:
# Read the whole file into a single DataFrame. This rebinds `movies`, which
# previously referred to the chunked reader.
movies = pd.read_csv('movies.csv', sep=',')

In [54]:
# dtype of each column in the frame
movies.dtypes

Unnamed: 0     int64
movie_id       int64
title         object
genres        object
dtype: object

In [58]:
# Convert Col2 to an integer dtype and write the converted column back over the original
my_own_dataset['Col2'] = my_own_dataset['Col2'].astype('int')

In [59]:
# Display the frame again to see Col2 after the integer cast
my_own_dataset

Unnamed: 0,Col1,Col2,Col3,Col4
0,0,1,1.0,Hello World!
1,1,1,1.0,Hello World!
2,2,1,1.0,Hello World!
3,3,1,1.0,Hello World!
4,4,1,1.0,Hello World!
5,5,1,1.0,Hello World!
6,6,1,1.0,Hello World!
7,7,1,1.0,Hello World!


In [76]:
# Relabel as "Funny" the genre of every movie whose genre is exactly "Comedy".
# Only the 'genres' column is modified; the titles are left untouched.
filter_comedy = movies['genres'].eq('Comedy')
movies.loc[filter_comedy, 'genres'] = 'Funny'

In [82]:
# For each distinct genre string, count the non-null entries in every other column
movies.groupby('genres').count()

Unnamed: 0_level_0,Unnamed: 0,movie_id,title
genres,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
Action,65,65,65
Action|Adventure,25,25,25
Action|Adventure|Animation,1,1,1
Action|Adventure|Animation|Children's|Fantasy,1,1,1
Action|Adventure|Animation|Horror|Sci-Fi,1,1,1
Action|Adventure|Children's,1,1,1
Action|Adventure|Children's|Comedy,2,2,2
Action|Adventure|Children's|Fantasy,1,1,1
Action|Adventure|Children's|Sci-Fi,1,1,1
Action|Adventure|Comedy,5,5,5


In [83]:
# Keep only the rows whose genre string equals this exact combination
movies.loc[movies['genres'].eq('Action|Adventure|Comedy|Horror|Sci-Fi')]

Unnamed: 0.1,Unnamed: 0,movie_id,title,genres
1197,1197,1215,Army of Darkness (1993),Action|Adventure|Comedy|Horror|Sci-Fi


In [85]:
# First five rows once the frame is ordered by title
movies.sort_values('title').head(5)

Unnamed: 0.1,Unnamed: 0,movie_id,title,genres
1962,1962,2031,"$1,000,000 Duck (1971)",Children's|Comedy
3043,3043,3112,'Night Mother (1986),Drama
769,769,779,'Til There Was You (1997),Drama|Romance
2003,2003,2072,"'burbs, The (1989)",Funny
3351,3351,3420,...And Justice for All (1979),Drama|Thriller


In [89]:
# Rows at positions 150 through 169, restricted to the title and genres columns
movies.iloc[150:170][['title', 'genres']]

Unnamed: 0,title,genres
150,"Addiction, The (1995)",Horror
151,Batman Forever (1995),Action|Adventure|Comedy|Crime
152,Belle de jour (1967),Drama
153,Beyond Rangoon (1995),Drama|War
154,Blue in the Face (1995),Funny
155,Canadian Bacon (1994),Comedy|War
156,Casper (1995),Adventure|Children's
157,Clockers (1995),Drama
158,Congo (1995),Action|Adventure|Mystery|Sci-Fi
159,Crimson Tide (1995),Drama|Thriller|War


In [98]:
# Look up the row labelled 124 and keep only its title and genres.
# `.loc` is used instead of `.ix`: `.ix` is deprecated and no longer exists in
# current pandas releases, and on the default integer index produced by
# read_csv both select this row by label.
movies.loc[124, ['title', 'genres']]

title     NeverEnding Story III, The (1994)
genres         Adventure|Children's|Fantasy
Name: 124, dtype: object

In [99]:
# Genre labels to be one-hot encoded (note that 'Mystery' is listed twice)
categories = pd.Series(
    [
        'Action',
        'Adventure',
        'Mystery',
        'Sci-Fi',
        'Crime',
        'Film-Noir',
        'Mystery',
        'Drama',
        'Thriller',
        'War',
    ]
)

In [102]:
# One indicator column per distinct category; each row flags the category held at that position
mapping = pd.get_dummies(categories)
mapping

Unnamed: 0,Action,Adventure,Crime,Drama,Film-Noir,Mystery,Sci-Fi,Thriller,War
0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
4,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0
5,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0
6,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0
7,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0
8,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
9,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0


In [105]:
# Indicator column for the 'Thriller' category
mapping['Thriller']

0    0.0
1    0.0
2    0.0
3    0.0
4    0.0
5    0.0
6    0.0
7    0.0
8    1.0
9    0.0
Name: Thriller, dtype: float64

In [109]:
import numpy as np

In [111]:
# Small integer array whose dtype and memory footprint are inspected in the next cells
arr_1 = np.array([8, 5, 3, 7])

In [114]:
# Element type NumPy selected for the array
arr_1.dtype

dtype('int64')

In [115]:
# Total bytes occupied by the array's elements (number of elements × bytes per element)
arr_1.nbytes

32