In [1]:
import pandas as pd

# A Series built from a list that contains a missing entry (None).
pd.Series([1, 2, 3, None])

In [2]:
pd.Series(['sai', 'prasanna', None])

0         sai
1    prasanna
2        None
dtype: object

In [3]:
import numpy as np

In [4]:
# NaN is not equal to False: the comparison evaluates to False.
np.nan == False

False

In [5]:
# NaN is not equal to None either.
np.nan == None

False

In [6]:
# NaN does not even compare equal to itself, so == cannot be used to detect it.
np.nan == np.nan

False

In [7]:
# np.isnan is the explicit test for NaN.
np.isnan(np.nan)

True

In [8]:
# Sport -> country mapping; the dict keys become the labels of the Series.
sports = {'Archery': 'Bhutan', 'Cricket': 'India'}
pd.Series(sports)

Archery    Bhutan
Cricket     India
dtype: object

In [9]:
# Data and labels passed separately: the list supplies the values, index= supplies the labels.
pd.Series(['Archery', 'Cricket'], index = ['Bhutan', 'India'])

Bhutan    Archery
India     Cricket
dtype: object

# Querying a series

In [10]:
# Sport -> country lookup used by the querying examples that follow.
sports = {
    'Archery': 'Bhutan',
    'Cricket': 'India',
    'Football': 'Spain',
    'Chess': 'France',
}
a = pd.Series(sports)

In [11]:
a

Archery     Bhutan
Cricket      India
Football     Spain
Chess       France
dtype: object

In [12]:
a.iloc[2]

'Spain'

In [13]:
a.loc['Chess']

'France'

In [14]:
# Plain brackets with an integer on a string-labelled Series: the result shown
# is the element at position 1.
# NOTE(review): newer pandas releases deprecate this positional fallback -
# confirm against the pandas version in use.
a[1]

'India'

In [15]:
a['Cricket']

'India'

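When you index a Series with plain brackets, pandas has to decide whether the key is a label (`loc`) or a position (`iloc`), which can lead to confusion. It is therefore better to index explicitly with `iloc` or `loc` rather than with plain brackets.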

In [16]:
# Integer labels 1-4, so a label no longer coincides with its position.
s = pd.Series([100, 120, 3, 101], index = [1, 2, 3, 4])

In [17]:
s

1    100
2    120
3      3
4    101
dtype: int64

In [18]:
s[3]

3

In [19]:
s[2]

120

As we can observe below, `s[0]` raises an error: when the index values are integers, plain brackets are treated as a label lookup (`loc`) and do not fall back to a positional lookup (`iloc`).

In [20]:
# There is no label 0 in the index [1, 2, 3, 4], so this lookup fails with KeyError.
s[0]

KeyError: 0

In [21]:
s = pd.Series([100.0, 120.0, 101.0, 3.0])
s

0    100.0
1    120.0
2    101.0
3      3.0
dtype: float64

In [22]:
# Sum the Series with an explicit Python loop, one element at a time.
total = 0
for item in s:
    total += item
total

324.0

In [23]:
# The same sum computed with a single call to np.sum.
total = np.sum(s)
total

324.0

In [24]:
# 10,000 random integers (randint with low=90, high=1000) used by the timing cells.
s = pd.Series(np.random.randint(90, 1000, 10000))

In [25]:
s.head()

0    190
1    332
2    172
3     97
4    291
dtype: int32

In [26]:
len(s)

10000

In [27]:
%%timeit -n 100
# Time the explicit Python loop over the Series.
total = 0
for item in s:
    total += item
total


5.65 ms ± 508 µs per loop (mean ± std. dev. of 7 runs, 100 loops each)


In [28]:
%%timeit -n 100
# Time the np.sum version of the same sum for comparison.
total = np.sum(s)

557 µs ± 49.1 µs per loop (mean ± std. dev. of 7 runs, 100 loops each)


In [29]:
%%timeit -n 10
s = pd.Series(np.random.randint(0, 1000, 10000))
# Series.items() yields (label, value) pairs; it replaces Series.iteritems(),
# which was removed in pandas 2.0.
# This is the slow variant of the comparison: every element is updated through
# its own .loc assignment.
for label, value in s.items():
    s.loc[label] = value + 2

2.82 s ± 141 ms per loop (mean ± std. dev. of 7 runs, 10 loops each)


In [30]:
%%timeit -n 10
s = pd.Series(np.random.randint(0, 1000, 10000))
# Vectorised form of the same update: add 2 to every element in one operation.
s += 2

The slowest run took 4.70 times longer than the fastest. This could mean that an intermediate result is being cached.
1.35 ms ± 1.11 ms per loop (mean ± std. dev. of 7 runs, 10 loops each)


In [31]:
a = pd.Series([1, 2, 3])
# Assigning to a label that does not exist yet adds a new entry; mixing a string
# value with the integers gives the object dtype shown in the output.
a.loc['Animal'] = 'Tiger'
a

0             1
1             2
2             3
Animal    Tiger
dtype: object

In [32]:
# The same label is used for every entry: index labels do not have to be unique.
z = pd.Series(['Australia', 'India', 'England'], index = ['Cricket', 'Cricket', 'Cricket'])

In [33]:
z

Cricket    Australia
Cricket        India
Cricket      England
dtype: object

In [34]:
z['Cricket']

Cricket    Australia
Cricket        India
Cricket      England
dtype: object

In [35]:
# Positional access spelled out with iloc: the index holds string labels, so a
# bare z[0] relies on pandas guessing that 0 is a position rather than a label.
z.iloc[0]

'Australia'

In [36]:
# Positional access spelled out with iloc: the index holds string labels, so a
# bare z[1] relies on pandas guessing that 1 is a position rather than a label.
z.iloc[1]

'India'

# The DataFrame Data Structure

In [37]:
# One Series per purchase record; each record carries the same three fields.
p1 = pd.Series({'Name': 'Prasanna', 'Item Purchased': 'Rice', 'Cost': 25.00})
p2 = pd.Series({'Name': 'Sriram', 'Item Purchased': 'Wheat', 'Cost': 30.00})
p3 = pd.Series({'Name': 'Paresh', 'Item Purchased': 'Biryani', 'Cost': 85.00})

# Stack the records as rows; the row labels are store names, and 'Store 1'
# intentionally appears twice.
df = pd.DataFrame(
    [p1, p2, p3],
    index=['Store 1', 'Store 1', 'Store 2'],
)
df.head()

Unnamed: 0,Name,Item Purchased,Cost
Store 1,Prasanna,Rice,25.0
Store 1,Sriram,Wheat,30.0
Store 2,Paresh,Biryani,85.0


In [38]:
df.loc['Store 2']

Name               Paresh
Item Purchased    Biryani
Cost                   85
Name: Store 2, dtype: object

In [39]:
type(df.loc['Store 2'])

pandas.core.series.Series

In [40]:
df.loc['Store 1']

Unnamed: 0,Name,Item Purchased,Cost
Store 1,Prasanna,Rice,25.0
Store 1,Sriram,Wheat,30.0


In [41]:
df['Item Purchased']

Store 1       Rice
Store 1      Wheat
Store 2    Biryani
Name: Item Purchased, dtype: object

In [42]:
# Row label and column label together: the 'Cost' values of the 'Store 1' rows.
df.loc['Store 1', 'Cost']

Store 1    25.0
Store 1    30.0
Name: Cost, dtype: float64

# Transpose of a dataframe

In [43]:
df.T

Unnamed: 0,Store 1,Store 1.1,Store 2
Name,Prasanna,Sriram,Paresh
Item Purchased,Rice,Wheat,Biryani
Cost,25,30,85


In [44]:
df.T.loc['Cost']

Store 1    25
Store 1    30
Store 2    85
Name: Cost, dtype: object

In [45]:
# drop() returns a new frame without the 'Store 1' rows; df itself is left
# unchanged, as the next display of df shows.
df.drop('Store 1')

Unnamed: 0,Name,Item Purchased,Cost
Store 2,Paresh,Biryani,85.0


# Drop

In [46]:
df

Unnamed: 0,Name,Item Purchased,Cost
Store 1,Prasanna,Rice,25.0
Store 1,Sriram,Wheat,30.0
Store 2,Paresh,Biryani,85.0


In [47]:
# Drop the 'Store 1' rows from an independent copy, leaving df as it was.
copy_df = df.copy().drop('Store 1')
copy_df

Unnamed: 0,Name,Item Purchased,Cost
Store 2,Paresh,Biryani,85.0


In [48]:
# del removes the 'Name' column from copy_df in place.
del copy_df['Name']
copy_df

Unnamed: 0,Item Purchased,Cost
Store 2,Biryani,85.0


In [49]:
# Assigning a scalar to a new column name adds that column with the same value
# (here None) in every row.
df['Location'] = None
df

Unnamed: 0,Name,Item Purchased,Cost,Location
Store 1,Prasanna,Rice,25.0,
Store 1,Sriram,Wheat,30.0,
Store 2,Paresh,Biryani,85.0,


# DataFrame indexing and loading

In [50]:
costs = df['Cost']

In [51]:
costs

Store 1    25.0
Store 1    30.0
Store 2    85.0
Name: Cost, dtype: float64

In [52]:
# In-place add on the Series taken from df['Cost']; the df displayed in the
# following cell shows the same increased costs.
# NOTE(review): whether df is affected depends on pandas' view/copy behaviour -
# confirm on the pandas version in use.
costs += 2
costs

Store 1    27.0
Store 1    32.0
Store 2    87.0
Name: Cost, dtype: float64

In [53]:
df

Unnamed: 0,Name,Item Purchased,Cost,Location
Store 1,Prasanna,Rice,27.0,
Store 1,Sriram,Wheat,32.0,
Store 2,Paresh,Biryani,87.0,


SyntaxError: invalid syntax (<ipython-input-56-7c0d88a2ec43>, line 1)

In [None]:
# Load the CSV with default settings and preview the first rows.
df = pd.read_csv('Filename.csv')
df.head()

In [None]:
# Reload the file using the first column as the row index and skipping the
# first line of the file. read_csv's keyword is `skiprows`; `skip_rows` is not
# an accepted argument and raises TypeError.
df = pd.read_csv('Filename.csv', index_col=0, skiprows=1)
df.head()

In [None]:
df.columns

In [None]:
# Replace the coded header prefixes with readable medal names.
# The new names are collected first and applied with a single rename call; the
# original passed `columns==...`, which is a syntax error, and renamed in place
# once per matching column.
medal_prefixes = {'01': 'Gold', '02': 'Silver', '03': 'Bronze'}
renamed_columns = {}
for col in df.columns:
    if col[:2] in medal_prefixes:
        # Swap the first four characters of the header for the medal name.
        renamed_columns[col] = medal_prefixes[col[:2]] + col[4:]
    elif col[:1] == '#':
        # Only the one-character marker was matched, so only that character is
        # replaced (the original sliced off four characters, truncating the name).
        # NOTE(review): with '#' as both the tested and the replacement
        # character this leaves the name unchanged; if the raw header uses a
        # different marker glyph, test for that glyph here instead.
        renamed_columns[col] = '#' + col[1:]
df = df.rename(columns=renamed_columns)
df.head()

In [None]:
df['Gold'] > 0

In [None]:
# Boolean mask of the rows whose 'Gold' value is positive. DataFrame.where
# keeps df's shape and, by default, fills the rows that fail the mask with NaN.
gold_mask = df['Gold'] > 0
only_gold = df.where(gold_mask)
only_gold.head()

In [None]:
only_gold['Gold'].count()

In [None]:
df['Gold'].count()

In [None]:
# dropna() returns a new frame rather than modifying only_gold, so the result
# is assigned back; otherwise the cells that follow still see the NaN rows.
only_gold = only_gold.dropna()

In [None]:
only_gold.head()

In [None]:
# Count the rows where either 'Gold' or 'Gold.1' is positive.
# Each comparison needs its own parentheses: `|` binds more tightly than `>`,
# so the unparenthesised form was evaluated as
# ((df['Gold'] > 0) | df['Gold.1']) > 0.
len(df[(df['Gold'] > 0) | (df['Gold.1'] > 0)])

In [None]:
# Rows with a positive 'Gold.1' value and a 'Gold' value of exactly zero.
df[(df['Gold.1'] > 0) & (df['Gold'] == 0)]