In [1]:
import pandas as pd
import numpy as np

In [2]:
s = pd.Series([1, 3, 5, np.nan, 6, 8])
s

0    1.0
1    3.0
2    5.0
3    NaN
4    6.0
5    8.0
dtype: float64

In [3]:
s1 = pd.Series((1, 'IACSD', 5, np.nan, 6, 8))
s1

0        1
1    IACSD
2        5
3      NaN
4        6
5        8
dtype: object

In [4]:
#Creating a DataFrame by passing a NumPy array, 
#with a datetime index using date_range() 
#and labeled columns:

dates = pd.date_range("20230315", 
                      periods=6)

dates

DatetimeIndex(['2023-03-15', '2023-03-16', '2023-03-17', '2023-03-18',
               '2023-03-19', '2023-03-20'],
              dtype='datetime64[ns]', freq='D')

In [7]:
from numpy.random import default_rng
rng = default_rng()
    
df = pd.DataFrame(rng.standard_normal((6, 4)), 
                  index=dates, 
                  columns=list("ABCD"))

df

Unnamed: 0,A,B,C,D
2023-03-15,0.861831,0.762996,-0.352265,-0.380021
2023-03-16,-0.419372,-0.931079,0.59073,-1.391824
2023-03-17,-0.659521,0.53238,-0.056196,1.449236
2023-03-18,0.380019,0.692852,-1.613918,-1.147825
2023-03-19,-0.492103,0.166803,-0.784841,-1.972621
2023-03-20,0.638982,1.203661,1.033805,0.010193


In [21]:
# crate dataframe using list of list
data = [[1,2,3],[4,5,6],[7,8,9,4],[10,11,12]]
df = pd.DataFrame(data)
print(df)
df = pd.DataFrame(data,columns =['c1','c2','c3','c4'])
print(df)
df = pd.DataFrame(data,index = [101,102,103,104],columns =['c1','c2','c3','c4'])
print(df)
print(df.dtypes)

    0   1   2    3
0   1   2   3  NaN
1   4   5   6  NaN
2   7   8   9  4.0
3  10  11  12  NaN
   c1  c2  c3   c4
0   1   2   3  NaN
1   4   5   6  NaN
2   7   8   9  4.0
3  10  11  12  NaN
     c1  c2  c3   c4
101   1   2   3  NaN
102   4   5   6  NaN
103   7   8   9  4.0
104  10  11  12  NaN
c1      int64
c2      int64
c3      int64
c4    float64
dtype: object


In [17]:
#Creating a DataFrame by passing a dictionary of objects 
#that can be converted into a series-like structure:

df2 = pd.DataFrame(
    {
        "A": 1.0,
        "B": pd.Timestamp("20220102"),
        "C": pd.Series(1, index=list(range(4)), dtype="float32"),
        "D": np.array([3] * 4, dtype="int32"),
        "E": pd.Categorical(["test", "train", "test", "train"]),
        "F": "foo",
    }
)

print(df2)

Unnamed: 0,A,B,C,D,E,F
0,1.0,2022-01-02,1.0,3,test,foo
1,1.0,2022-01-02,1.0,3,train,foo
2,1.0,2022-01-02,1.0,3,test,foo
3,1.0,2022-01-02,1.0,3,train,foo


In [19]:
df2.dtypes

A          float64
B    datetime64[s]
C          float32
D            int32
E         category
F           object
dtype: object

In [18]:
s = pd.Series(1,range(4))
s

0    1
1    1
2    1
3    1
dtype: int64

In [24]:
# head
# get first 5 rows from dataframe
# if n is passed then first n rows
print(df.head())

#tail
# get last 5 rows
# if n is passed then last n rows
df.tail(3)

                   A         B         C         D
2023-03-15  0.769873 -0.301536 -0.532692  1.231501
2023-03-16  1.384759  0.991228 -0.352336  1.878296
2023-03-17  0.466206 -1.014490  0.791180  0.279244
2023-03-18  0.618649  2.886395 -0.873613 -1.099681
2023-03-19  1.623741 -0.401910  0.708176  1.072964


Unnamed: 0,A,B,C,D
2023-03-18,0.618649,2.886395,-0.873613,-1.099681
2023-03-19,1.623741,-0.40191,0.708176,1.072964
2023-03-20,-0.498708,-1.135717,0.026655,-1.164514


In [25]:
df.index

DatetimeIndex(['2023-03-15', '2023-03-16', '2023-03-17', '2023-03-18',
               '2023-03-19', '2023-03-20'],
              dtype='datetime64[ns]', freq='D')

In [28]:
df.columns

Index(['A', 'B', 'C', 'D'], dtype='object')

In [None]:
# Data frame to numpy array
"""
DataFrame.to_numpy() gives a NumPy representation 
of the underlying data. 

DataFrame.to_numpy() does not include 
the index or column labels in the output.

Note that this can be an expensive operation 
when your DataFrame has columns with different data types, 
which comes down to a fundamental difference between 
pandas and NumPy
NumPy arrays have one dtype for the entire array, 
while pandas DataFrames have one dtype per column. 
When you call DataFrame.to_numpy(), pandas will find 
the NumPy dtype that can hold all of the dtypes 
in the DataFrame. This may end up being object, 
which requires casting every value to a Python object.
"""
"""
For DataFrame of all floating-point values, 
DataFrame.to_numpy() is fast  
Also it doesn’t require copying data:

In [34]:

print(df)
df.to_numpy()

                   A         B         C         D
2023-03-15  0.769873 -0.301536 -0.532692  1.231501
2023-03-16  1.384759  0.991228 -0.352336  1.878296
2023-03-17  0.466206 -1.014490  0.791180  0.279244
2023-03-18  0.618649  2.886395 -0.873613 -1.099681
2023-03-19  1.623741 -0.401910  0.708176  1.072964
2023-03-20 -0.498708 -1.135717  0.026655 -1.164514


array([[ 0.7698729 , -0.30153562, -0.53269233,  1.23150079],
       [ 1.38475926,  0.99122827, -0.35233566,  1.87829645],
       [ 0.46620584, -1.01449034,  0.79117995,  0.27924392],
       [ 0.6186492 ,  2.88639486, -0.87361308, -1.09968064],
       [ 1.62374092, -0.40190976,  0.70817608,  1.07296406],
       [-0.49870772, -1.13571671,  0.02665502, -1.16451364]])

In [35]:
df.describe()

Unnamed: 0,A,B,C,D
count,6.0,6.0,6.0,6.0
mean,0.72742,0.170662,-0.038772,0.366302
std,0.751098,1.530396,0.676797,1.267608
min,-0.498708,-1.135717,-0.873613,-1.164514
25%,0.504317,-0.861345,-0.487603,-0.75495
50%,0.694261,-0.351723,-0.16284,0.676104
75%,1.231038,0.668037,0.537796,1.191867
max,1.623741,2.886395,0.79118,1.878296


In [36]:
# Transpose of DataFrame
# Here index and column names are swapped
df.T

Unnamed: 0,2023-03-15,2023-03-16,2023-03-17,2023-03-18,2023-03-19,2023-03-20
A,0.769873,1.384759,0.466206,0.618649,1.623741,-0.498708
B,-0.301536,0.991228,-1.01449,2.886395,-0.40191,-1.135717
C,-0.532692,-0.352336,0.79118,-0.873613,0.708176,0.026655
D,1.231501,1.878296,0.279244,-1.099681,1.072964,-1.164514


In [44]:
# Sort by an axis
# axis 0 is row direction
# axis 1 is column direction

print(df.sort_index(axis=1, ascending=False))
df.sort_index(axis=0, ascending=False)

                   D         C         B         A
2023-03-15  1.231501 -0.532692 -0.301536  0.769873
2023-03-16  1.878296 -0.352336  0.991228  1.384759
2023-03-17  0.279244  0.791180 -1.014490  0.466206
2023-03-18 -1.099681 -0.873613  2.886395  0.618649
2023-03-19  1.072964  0.708176 -0.401910  1.623741
2023-03-20 -1.164514  0.026655 -1.135717 -0.498708


Unnamed: 0,A,B,C,D
2023-03-20,-0.498708,-1.135717,0.026655,-1.164514
2023-03-19,1.623741,-0.40191,0.708176,1.072964
2023-03-18,0.618649,2.886395,-0.873613,-1.099681
2023-03-17,0.466206,-1.01449,0.79118,0.279244
2023-03-16,1.384759,0.991228,-0.352336,1.878296
2023-03-15,0.769873,-0.301536,-0.532692,1.231501


In [46]:
# Sort by values in given column 
#or list of columns
df.sort_values(by="B")

Unnamed: 0,A,B,C,D
2023-03-20,-0.498708,-1.135717,0.026655,-1.164514
2023-03-17,0.466206,-1.01449,0.79118,0.279244
2023-03-19,1.623741,-0.40191,0.708176,1.072964
2023-03-15,0.769873,-0.301536,-0.532692,1.231501
2023-03-16,1.384759,0.991228,-0.352336,1.878296
2023-03-18,0.618649,2.886395,-0.873613,-1.099681


In [None]:
df2 = pd.DataFrame(
    {
        "A": 1.0,
        "B": pd.Timestamp("20220102"),
        "C": pd.Series(1, index=list(range(4)), dtype="float32"),
        "D": np.array([3] * 4, dtype="int32"),
        "E": pd.Categorical(["test", "train", "test", "train"]),
        "F": "foo",
    }
)

print(df2)

Create a data frame from dictionary
column names - 3 module names
row labeles - roll no
Enter the data in following order
roll no sql python AA
101
109
102
125
110

In [3]:
d = {
    "Python": [98,87,57,89,70],
    "Sql": [90,67,89,87,56],
    "AA" : [70,67,99,78,77]
}
Rollno = [101,109,102,125,110]
dd = pd.DataFrame(d,index = Rollno)
print(dd)

     Python  Sql  AA
101      98   90  70
109      87   67  67
102      57   89  99
125      89   87  78
110      70   56  77


Sort all colums

In [4]:
dd.sort_index(axis =1)

Unnamed: 0,AA,Python,Sql
101,70,98,90
109,67,87,67
102,99,57,89
125,78,89,87
110,77,70,56


In [5]:
dd.sort_index(axis =0)

Unnamed: 0,Python,Sql,AA
101,98,90,70
102,57,89,99
109,87,67,67
110,70,56,77
125,89,87,78


In [6]:
dd.sort_values("Python", ascending = False)

Unnamed: 0,Python,Sql,AA
101,98,90,70
125,89,87,78
109,87,67,67
110,70,56,77
102,57,89,99


In [7]:
dd["Python"]

101    98
109    87
102    57
125    89
110    70
Name: Python, dtype: int64

In [8]:
dd.iloc[[0]]

Unnamed: 0,Python,Sql,AA
101,98,90,70


In [9]:
dd.iloc[:3]

Unnamed: 0,Python,Sql,AA
101,98,90,70
109,87,67,67
102,57,89,99


In [10]:
dd.iloc[:2]

Unnamed: 0,Python,Sql,AA
101,98,90,70
109,87,67,67


In [11]:
dd.iloc[:,1:3]

Unnamed: 0,Sql,AA
101,90,70
109,67,67
102,89,99
125,87,78
110,56,77


In [12]:
dd.iloc[2:4,1:3]

Unnamed: 0,Sql,AA
102,89,99
125,87,78


In [13]:
dd.loc[[102]]

Unnamed: 0,Python,Sql,AA
102,57,89,99


In [14]:
dd.loc[:,'AA']

101    70
109    67
102    99
125    78
110    77
Name: AA, dtype: int64

In [15]:
dd.loc[101:102,['AA','Sql']]

Unnamed: 0,AA,Sql
101,70,90
109,67,67
102,99,89


In [16]:
dd.loc[1:3,['AA','Sql']]

KeyError: 1

In [17]:
dd.loc[101:102]

Unnamed: 0,Python,Sql,AA
101,98,90,70
109,87,67,67
102,57,89,99


In [18]:
dd.loc[101,'AA']

70

## at
returns a single value based on label

## iat
returns a single value based on index

In [19]:
dd.at[101,"Python"]

98

In [21]:
dd.iat[0,1]

90

In [26]:
dd[dd.Sql== 90]

Unnamed: 0,Python,Sql,AA
101,98,90,70


## print the record of last three students using iloc, loc

In [32]:
dd.iloc[-3:]

Unnamed: 0,Python,Sql,AA
102,57,89,99
125,89,87,78
110,70,56,77


In [30]:
dd.loc[102:]

Unnamed: 0,Python,Sql,AA
102,57,89,99
125,89,87,78
110,70,56,77


In [31]:
dd.iloc[2:5]

Unnamed: 0,Python,Sql,AA
102,57,89,99
125,89,87,78
110,70,56,77


In [33]:
dd

Unnamed: 0,Python,Sql,AA
101,98,90,70
109,87,67,67
102,57,89,99
125,89,87,78
110,70,56,77


In [35]:
dd.loc[[101,110],'Python']

101    98
110    70
Name: Python, dtype: int64

In [42]:
dd.iloc[[0,-1]]

Unnamed: 0,Python,Sql,AA
101,98,90,70
110,70,56,77


## Print row of student who is 3rd highest

In [45]:
d1 = dd.sort_values('AA',ascending = False)
print(d1)
d1.iloc[[2]]

[     Python  Sql  AA
102      57   89  99
125      89   87  78
110      70   56  77
101      98   90  70
109      87   67  67]


Unnamed: 0,Python,Sql,AA
110,70,56,77


In [44]:
dd.sort_values('AA',ascending = False).iloc[[2]]

Unnamed: 0,Python,Sql,AA
110,70,56,77


In [50]:
dates = pd.date_range("20230315", 
                      periods=6)

dates

DatetimeIndex(['2023-03-15', '2023-03-16', '2023-03-17', '2023-03-18',
               '2023-03-19', '2023-03-20'],
              dtype='datetime64[ns]', freq='D')

In [51]:
from numpy.random import default_rng
rng = default_rng()
    
df = pd.DataFrame(rng.standard_normal((6, 4)), 
                  index=dates, 
                  columns=list("ABCD"))

df

Unnamed: 0,A,B,C,D
2023-03-15,-0.091235,0.275372,1.107693,-1.939522
2023-03-16,0.562316,-1.061591,0.346527,-1.230909
2023-03-17,1.117288,1.075569,-0.604605,0.723308
2023-03-18,-0.060737,-1.76734,-0.638104,-0.566394
2023-03-19,0.427254,-2.667644,-0.51209,-0.608257
2023-03-20,0.065347,0.791332,-1.081979,0.130476


## Boolean indexing

In [52]:
print(df>0)

                A      B      C      D
2023-03-15  False   True   True  False
2023-03-16   True  False   True  False
2023-03-17   True   True  False   True
2023-03-18  False  False  False  False
2023-03-19   True  False  False  False
2023-03-20   True   True  False   True


In [54]:
df[df > 0]

Unnamed: 0,A,B,C,D
2023-03-15,,0.275372,1.107693,
2023-03-16,0.562316,,0.346527,
2023-03-17,1.117288,1.075569,,0.723308
2023-03-18,,,,
2023-03-19,0.427254,,,
2023-03-20,0.065347,0.791332,,0.130476


In [55]:
df[df > 1]

Unnamed: 0,A,B,C,D
2023-03-15,,,1.107693,
2023-03-16,,,,
2023-03-17,1.117288,1.075569,,
2023-03-18,,,,
2023-03-19,,,,
2023-03-20,,,,


In [56]:
dd

Unnamed: 0,Python,Sql,AA
101,98,90,70
109,87,67,67
102,57,89,99
125,89,87,78
110,70,56,77


In [64]:
dd[(dd['Sql']) > 80 & (dd['AA'] >70)]

Unnamed: 0,Python,Sql,AA
101,98,90,70
109,87,67,67
102,57,89,99
125,89,87,78
110,70,56,77


In [70]:
dd.loc[(dd['Sql'] > 85) & (dd['AA'] > 75),['Sql','AA']]

Unnamed: 0,Sql,AA
102,89,99
125,87,78


Select all students who have received more thabn 23 marks in  python

In [66]:
dd.loc[dd['Python'] > 85]

Unnamed: 0,Python,Sql,AA
101,98,90,70
109,87,67,67
125,89,87,78


Select all students where Sql Marks > 29 and AA > 20 marks in  python

In [71]:
dd.loc[(dd['Sql'] > 85) & (dd['AA'] > 75),['Sql','AA']]

Unnamed: 0,Sql,AA
102,89,99
125,87,78


## isin

In [77]:
dd[dd['Python'].isin([98,89])]

Unnamed: 0,Python,Sql,AA
101,98,90,70
125,89,87,78


In [81]:
dd.mean(include = ['int32'])

TypeError: mean() got an unexpected keyword argument 'include'

In [5]:
from numpy.random import default_rng
rng = default_rng()

df = pd.DataFrame(rng.standard_normal((6, 4)),
                  index=dates,
                  columns=list("ABCD"))

print(df)
print("############")

df.at['2023-03-15', "A"] = 0

print(df)

#using index -> first row and third column
df.iat[0, 2] = 0

df

# Using Numpy array
df.loc[:, "D"] = np.array([5] * len(df))

df

# Update using condition
df2 = df.copy()
df2['E']=10
df2[df2 <= 0] = -df2

df2

# Mean median Mode of All columns

df.mean(axis=1) #comulm

df.median()

# Mode
# Mode may return multiple values
df1 = pd.DataFrame()
df1['cat1'] = ['A','B','B','B','C','D']
df1['cat2'] = [1,1,2,2,3,4]
df1['rno'] = [1,2,3,4,5,6]

print(df1.mode())

                   A         B         C         D
2023-03-15 -0.010346 -1.188263  0.042752  0.501029
2023-03-16  0.240217  0.377011 -0.299032  0.468358
2023-03-17  0.948838 -0.013062 -0.761862  2.396917
2023-03-18  0.847724  1.498921  0.327841 -0.774557
2023-03-19 -0.838875 -0.124079  1.305956 -1.653886
2023-03-20 -0.482128  0.443511 -1.042789 -0.283459
############
                   A         B         C         D
2023-03-15  0.000000 -1.188263  0.042752  0.501029
2023-03-16  0.240217  0.377011 -0.299032  0.468358
2023-03-17  0.948838 -0.013062 -0.761862  2.396917
2023-03-18  0.847724  1.498921  0.327841 -0.774557
2023-03-19 -0.838875 -0.124079  1.305956 -1.653886
2023-03-20 -0.482128  0.443511 -1.042789 -0.283459
  cat1  cat2  rno
0    B   1.0    1
1  NaN   2.0    2
2  NaN   NaN    3
3  NaN   NaN    4
4  NaN   NaN    5
5  NaN   NaN    6


NameError: name 'Print' is not defined

In [6]:
df1 = pd.DataFrame({'ID': [1, 2, 3, 4, 5, 6],
                    'Age': [12, 34, 56, 45, 33, 12], 
                    'City': ['Pune', 'Pune', 'Nagpur', 'Pune', 'Bombay', 'Nagpur']})
df2 = pd.DataFrame({'ID': [1, 2, 3, 4, 5, 6],
                    'UNID': [11123, 11125, 11134, 11156, 11190, 11166], 
                    'Name': ['Raj', 'Smita', 'Nayan', 'Sanchi', 'Swaroop', 'Harshita']})
df3 = pd.DataFrame({'ID': [1, 2, 3, 4, 5, 6],
                    'Gender': ['M', 'F', 'M', 'F', 'M', 'F'], 
                    'Mobile': ['9822123456', '9822123444', '9822456777', '9822123999', '9822456123', '9822456000']})

In [7]:
df1

Unnamed: 0,ID,Age,City
0,1,12,Pune
1,2,34,Pune
2,3,56,Nagpur
3,4,45,Pune
4,5,33,Bombay
5,6,12,Nagpur


In [8]:
df2

Unnamed: 0,ID,UNID,Name
0,1,11123,Raj
1,2,11125,Smita
2,3,11134,Nayan
3,4,11156,Sanchi
4,5,11190,Swaroop
5,6,11166,Harshita


In [9]:
df3

Unnamed: 0,ID,Gender,Mobile
0,1,M,9822123456
1,2,F,9822123444
2,3,M,9822456777
3,4,F,9822123999
4,5,M,9822456123
5,6,F,9822456000


In [10]:
result = pd.concat([df1, df2], axis=1)
result

Unnamed: 0,ID,Age,City,ID.1,UNID,Name
0,1,12,Pune,1,11123,Raj
1,2,34,Pune,2,11125,Smita
2,3,56,Nagpur,3,11134,Nayan
3,4,45,Pune,4,11156,Sanchi
4,5,33,Bombay,5,11190,Swaroop
5,6,12,Nagpur,6,11166,Harshita


In [11]:
result = pd.merge(df1, df2, on='ID')
result

Unnamed: 0,ID,Age,City,UNID,Name
0,1,12,Pune,11123,Raj
1,2,34,Pune,11125,Smita
2,3,56,Nagpur,11134,Nayan
3,4,45,Pune,11156,Sanchi
4,5,33,Bombay,11190,Swaroop
5,6,12,Nagpur,11166,Harshita


In [24]:
result.groupby('City')[['Age']].sum()


Unnamed: 0_level_0,Age
City,Unnamed: 1_level_1
Bombay,33
Nagpur,68
Pune,91


In [25]:
result.groupby('City')[['Age']]
r.groups
r.get_group(('Pune'))

Unnamed: 0,ID,Age,City,UNID,Name
0,1,12,Pune,11123,Raj
1,2,34,Pune,11125,Smita
3,4,45,Pune,11156,Sanchi


In [28]:
result[(result.City == 'Pune') & (result['Age'] > 20)]

Unnamed: 0,ID,Age,City,UNID,Name
1,2,34,Pune,11125,Smita
3,4,45,Pune,11156,Sanchi


In [30]:
s1 = pd.Series([1,2,5,7,3,5,3,8,9,7])
for i in s1:
    if(i <4):
        print(i)

1
2
3
3


## DATASET

In [35]:
df = pd.read_csv("MarketArrivals.csv")
df

Unnamed: 0,market,month,year,quantity,priceMin,priceMax,priceMod,state,city,date
0,ABOHAR(PB),January,2005,2350,404,493,446,PB,ABOHAR,January-2005
1,ABOHAR(PB),January,2006,900,487,638,563,PB,ABOHAR,January-2006
2,ABOHAR(PB),January,2010,790,1283,1592,1460,PB,ABOHAR,January-2010
3,ABOHAR(PB),January,2011,245,3067,3750,3433,PB,ABOHAR,January-2011
4,ABOHAR(PB),January,2012,1035,523,686,605,PB,ABOHAR,January-2012
...,...,...,...,...,...,...,...,...,...,...
10222,YEOLA(MS),December,2011,131326,282,612,526,MS,YEOLA,December-2011
10223,YEOLA(MS),December,2012,207066,485,1327,1136,MS,YEOLA,December-2012
10224,YEOLA(MS),December,2013,215883,472,1427,1177,MS,YEOLA,December-2013
10225,YEOLA(MS),December,2014,201077,446,1654,1456,MS,YEOLA,December-2014


In [39]:
df.shape

(10227, 10)

In [44]:
list(df.columns)

['market',
 'month',
 'year',
 'quantity',
 'priceMin',
 'priceMax',
 'priceMod',
 'state',
 'city',
 'date']

In [46]:
df.dtypes

market      object
month       object
year         int64
quantity     int64
priceMin     int64
priceMax     int64
priceMod     int64
state       object
city        object
date        object
dtype: object

In [52]:
df.market.unique()

array(['ABOHAR(PB)', 'AGRA(UP)', 'AHMEDABAD(GUJ)', 'AHMEDNAGAR(MS)',
       'AJMER(RAJ)', 'ALIGARH(UP)', 'ALWAR(RAJ)', 'AMRITSAR(PB)',
       'BALLIA(UP)', 'BANGALORE', 'BAREILLY(UP)', 'BELGAUM(KNT)',
       'BHATINDA(PB)', 'BHAVNAGAR(GUJ)', 'BHOPAL', 'BHUBNESWER(OR)',
       'BIHARSHARIF(BHR)', 'BIJAPUR(KNT)', 'BIKANER(RAJ)', 'BOMBORI(MS)',
       'BURDWAN(WB)', 'CHAKAN(MS)', 'CHALLAKERE(KNT)', 'CHANDIGARH',
       'CHANDVAD(MS)', 'CHENNAI', 'CHICKBALLAPUR(KNT)',
       'COIMBATORE(TN) (bellary)', 'COIMBATORE(TN) (podisu)',
       'DEESA(GUJ)', 'DEHRADOON(UTT)', 'DELHI', 'DEORIA(UP)',
       'DEVALA(MS)', 'DEWAS(MP)', 'DHAVANGERE(KNT)', 'DHULIA(MS)',
       'DINDIGUL(TN)', 'DINDIGUL(TN)(Podis', 'DINDORI(MS)', 'ETAWAH(UP)',
       'GONDAL(GUJ)', 'GORAKHPUR(UP)', 'GUWAHATI', 'HALDWANI(UTT)',
       'HASSAN(KNT)', 'HOSHIARPUR(PB)', 'HUBLI(KNT)', 'HYDERABAD',
       'INDORE(MP)', 'JAIPUR', 'JALANDHAR(PB)', 'JALGAON(MS)',
       'JALGAON(WHITE)', 'JAMMU', 'JAMNAGAR(GUJ)', 'JODHPUR(RAJ)',
 

In [53]:
df.market.nunique()

120

In [48]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10227 entries, 0 to 10226
Data columns (total 10 columns):
 #   Column    Non-Null Count  Dtype 
---  ------    --------------  ----- 
 0   market    10227 non-null  object
 1   month     10227 non-null  object
 2   year      10227 non-null  int64 
 3   quantity  10227 non-null  int64 
 4   priceMin  10227 non-null  int64 
 5   priceMax  10227 non-null  int64 
 6   priceMod  10227 non-null  int64 
 7   state     10227 non-null  object
 8   city      10227 non-null  object
 9   date      10227 non-null  object
dtypes: int64(5), object(5)
memory usage: 799.1+ KB


In [49]:
df.describe()

Unnamed: 0,year,quantity,priceMin,priceMax,priceMod
count,10227.0,10227.0,10227.0,10227.0,10227.0
mean,2009.022294,76604.88,646.944363,1212.760731,984.284345
std,4.372841,124408.7,673.12185,979.658874,818.471498
min,1996.0,20.0,16.0,145.0,80.0
25%,2006.0,8898.0,209.0,557.0,448.0
50%,2009.0,27460.0,440.0,923.0,747.0
75%,2013.0,88356.5,828.0,1527.0,1248.0
max,2016.0,1639032.0,6000.0,8192.0,6400.0


In [50]:
df.describe(include="object")

Unnamed: 0,market,month,state,city,date
count,10227,10227,10227,10227,10227
unique,120,12,21,117,242
top,LASALGAON(MS),February,MS,LASALGAON,September-2015
freq,242,930,4354,242,97


In [56]:
df.market.mode()

0    LASALGAON(MS)
Name: market, dtype: object

In [57]:
df.head()

Unnamed: 0,market,month,year,quantity,priceMin,priceMax,priceMod,state,city,date
0,ABOHAR(PB),January,2005,2350,404,493,446,PB,ABOHAR,January-2005
1,ABOHAR(PB),January,2006,900,487,638,563,PB,ABOHAR,January-2006
2,ABOHAR(PB),January,2010,790,1283,1592,1460,PB,ABOHAR,January-2010
3,ABOHAR(PB),January,2011,245,3067,3750,3433,PB,ABOHAR,January-2011
4,ABOHAR(PB),January,2012,1035,523,686,605,PB,ABOHAR,January-2012


In [67]:
df1 = df.loc[:,['state','market']].drop_duplicates()
df1.groupby('state')['market'].count().nlargest(1)

state
MS    38
Name: market, dtype: int64

In [69]:
df[df.market == 'LASALGAON(MS)'].groupby('year')['priceMax'].max()

year
1996     560
1997    1144
1998    2954
1999     664
2000     756
2001     889
2002     686
2003     929
2004     877
2005    1326
2006     897
2007    1580
2008    1258
2009    2013
2010    3749
2011    3060
2012    1390
2013    5040
2014    2310
2015    4616
2016    1407
Name: priceMax, dtype: int64

In [None]:
df[(df.state = 'PB1') & (df.priceMod < df.priceMod.mean()) &  (df.quantity < df.quantity.mean()) & df.date < df.date]

# Graphs

In [2]:
df = pd.read_csv("EmployeeAttrition.csv")

In [3]:
df.head()

Unnamed: 0,Age,Attrition,BusinessTravel,DailyRate,Department,DistanceFromHome,Education,EducationField,EmployeeCount,EmployeeNumber,...,RelationshipSatisfaction,StandardHours,StockOptionLevel,TotalWorkingYears,TrainingTimesLastYear,WorkLifeBalance,YearsAtCompany,YearsInCurrentRole,YearsSinceLastPromotion,YearsWithCurrManager
0,41,Yes,Travel_Rarely,1102,Sales,1,2,Life Sciences,1,1,...,1,80,0,8,0,1,6,4,0,5
1,49,No,Travel_Frequently,279,Research & Development,8,1,Life Sciences,1,2,...,4,80,1,10,3,3,10,7,1,7
2,37,Yes,Travel_Rarely,1373,Research & Development,2,2,Other,1,4,...,2,80,0,7,3,3,0,0,0,0
3,33,No,Travel_Frequently,1392,Research & Development,3,4,Life Sciences,1,5,...,3,80,0,8,3,3,8,7,3,0
4,27,No,Travel_Rarely,591,Research & Development,2,1,Medical,1,7,...,4,80,1,6,3,3,2,2,2,2


In [5]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1470 entries, 0 to 1469
Data columns (total 35 columns):
 #   Column                    Non-Null Count  Dtype 
---  ------                    --------------  ----- 
 0   Age                       1470 non-null   int64 
 1   Attrition                 1470 non-null   object
 2   BusinessTravel            1470 non-null   object
 3   DailyRate                 1470 non-null   int64 
 4   Department                1470 non-null   object
 5   DistanceFromHome          1470 non-null   int64 
 6   Education                 1470 non-null   int64 
 7   EducationField            1470 non-null   object
 8   EmployeeCount             1470 non-null   int64 
 9   EmployeeNumber            1470 non-null   int64 
 10  EnvironmentSatisfaction   1470 non-null   int64 
 11  Gender                    1470 non-null   object
 12  HourlyRate                1470 non-null   int64 
 13  JobInvolvement            1470 non-null   int64 
 14  JobLevel                

In [6]:
df.describe()

Unnamed: 0,Age,DailyRate,DistanceFromHome,Education,EmployeeCount,EmployeeNumber,EnvironmentSatisfaction,HourlyRate,JobInvolvement,JobLevel,...,RelationshipSatisfaction,StandardHours,StockOptionLevel,TotalWorkingYears,TrainingTimesLastYear,WorkLifeBalance,YearsAtCompany,YearsInCurrentRole,YearsSinceLastPromotion,YearsWithCurrManager
count,1470.0,1470.0,1470.0,1470.0,1470.0,1470.0,1470.0,1470.0,1470.0,1470.0,...,1470.0,1470.0,1470.0,1470.0,1470.0,1470.0,1470.0,1470.0,1470.0,1470.0
mean,36.92381,802.485714,9.192517,2.912925,1.0,1024.865306,2.721769,65.891156,2.729932,2.063946,...,2.712245,80.0,0.793878,11.279592,2.79932,2.761224,7.008163,4.229252,2.187755,4.123129
std,9.135373,403.5091,8.106864,1.024165,0.0,602.024335,1.093082,20.329428,0.711561,1.10694,...,1.081209,0.0,0.852077,7.780782,1.289271,0.706476,6.126525,3.623137,3.22243,3.568136
min,18.0,102.0,1.0,1.0,1.0,1.0,1.0,30.0,1.0,1.0,...,1.0,80.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0
25%,30.0,465.0,2.0,2.0,1.0,491.25,2.0,48.0,2.0,1.0,...,2.0,80.0,0.0,6.0,2.0,2.0,3.0,2.0,0.0,2.0
50%,36.0,802.0,7.0,3.0,1.0,1020.5,3.0,66.0,3.0,2.0,...,3.0,80.0,1.0,10.0,3.0,3.0,5.0,3.0,1.0,3.0
75%,43.0,1157.0,14.0,4.0,1.0,1555.75,4.0,83.75,3.0,3.0,...,4.0,80.0,1.0,15.0,3.0,3.0,9.0,7.0,3.0,7.0
max,60.0,1499.0,29.0,5.0,1.0,2068.0,4.0,100.0,4.0,5.0,...,4.0,80.0,3.0,40.0,6.0,4.0,40.0,18.0,15.0,17.0


In [7]:
df['Age'].hist()

ImportError: matplotlib is required for plotting when the default backend "matplotlib" is selected.

In [8]:
m1=np.array([78,98,85])
m2=np.array([96,77,80])
m3=np.array([99,91,78])

In [11]:
(m1+m2+m3)/3


array([91.        , 88.66666667, 81.        ])

In [12]:
m1= np.array([[1,2],[3,4]])
m2 = np.array([[2,2],[8,4]])
m3 = np.array([[3,2],[5,4]])
m1+m2+m3

array([[ 6,  6],
       [16, 12]])

In [16]:
m4 = np.array([m1,m2,m3])
m4

array([[[1, 2],
        [3, 4]],

       [[2, 2],
        [8, 4]],

       [[3, 2],
        [5, 4]]])

In [17]:
m4.shape

(3, 2, 2)

In [19]:
print("sum()=",m4.sum())
print("sum(axis=0)",m4.sum(axis=0))
print("sum(axis=1)",m4.sum(axis=1))

sum()= 40
sum(axis=0) [[ 6  6]
 [16 12]]
sum(axis=1) [[ 4  6]
 [10  6]
 [ 8  6]]
