<a href="https://colab.research.google.com/github/hwmishra/andol/blob/master/data_science_day_2.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

#Numpy

### Broadcasting
* Subject to certain constraints, the smaller array is “broadcast” across the larger array so that they have compatible shapes

### NumPy 
* Numpy is the fundamental package for numerical computing with Python. It contains among other things:
* a powerful N-dimensional array object
* sophisticated (broadcasting) functions
* tools for integrating C/C++ and Fortran code
* useful linear algebra, Fourier transform, and random number capabilities

In [0]:
import numpy as np   # Importing libraries

a = np.array([0, 1, 2])
b = np.array([5, 5, 5])

print("Matrix A\n", a)
print("Matrix B\n", b)

print("Regular matrix addition A+B\n", a + b)

print("Addition using Broadcasting A+5\n", a + 5)

### Broadcasting Rules
When operating on two arrays, NumPy compares their shapes element-wise. It starts with the trailing dimensions, and works its way forward. Two dimensions are compatible when

1. they are equal, or
2.  one of them is 1


In [0]:
# Lets go for a 2D matrix
c = np.array([[0, 1, 2],[3, 4, 5],[6, 7, 8]])
d = np.array([[1, 2, 3],[1, 2, 3],[1, 2, 3]])

e = np.array([1, 2, 3])

print("Matrix C\n", c)
print("Matrix D\n", d)
print("Matrix E\n", e)

print("Regular matrix addition C+D\n", c + d)

print("Addition using Broadcasting C+E\n", c + e)

Matrix C
 [[0 1 2]
 [3 4 5]
 [6 7 8]]
Matrix D
 [[1 2 3]
 [1 2 3]
 [1 2 3]]
Matrix E
 [1 2 3]
Regular matrix addition C+D
 [[ 1  3  5]
 [ 4  6  8]
 [ 7  9 11]]
Addition using Broadcasting C+E
 [[ 1  3  5]
 [ 4  6  8]
 [ 7  9 11]]


In [0]:
M = np.ones((3, 3))
print("Matrix M:\n",M)

Matrix M:
 [[1. 1. 1.]
 [1. 1. 1.]
 [1. 1. 1.]]


In [0]:
print("Dimension of M: ",M.shape)
print("Dimension of a: ",a.shape)
print("Addition using Broadcasting")
print(M + a)
# Broadcasting array with matrix

Dimension of M:  (3, 3)
Dimension of a:  (3,)
Addition using Broadcasting
[[1. 2. 3.]
 [1. 2. 3.]
 [1. 2. 3.]]


## All in one program

In [0]:
# Importing libraries
import timeit

# Usage of builtin functions
start = timeit.default_timer()   

# Defining a list
array_list = [10,11,15,19,21,32]      
array_np_list = []

# Print the list
print("Original List",array_list,"\n")   

# Defining a function
def prime(num):      
    if num > 1:     
        
        # check for factors
        # Iterating a range of numbers
        for i in range(2,num):    
            if (num % i) == 0:
                
                # Appending data to list
                array_np_list.append(num)           
                print(num,"is not a prime number (",i,"times",num//i,"is",num,")")
                
                # Terminating a loop run
                break         
        else:
            print(num,"is a prime number")
            
# Iterating a list
for item in array_list:
    
    # Calling a function
    prime(item)         

print("\nNon-prime List",array_np_list,"\n")

end = timeit.default_timer()

# Computing running time
print("Time Taken to run the program:",end - start, "seconds")       

Original List [10, 11, 15, 19, 21, 32] 

10 is not a prime number ( 2 times 5 is 10 )
11 is a prime number
15 is not a prime number ( 3 times 5 is 15 )
19 is a prime number
21 is not a prime number ( 3 times 7 is 21 )
32 is not a prime number ( 2 times 16 is 32 )

Non-prime List [10, 15, 21, 32] 

Time Taken to run the program: 0.004166599999962273 seconds


### Note:
* Python is a procedural Language
* Two versions of Python 2 vs 3
* No braces. i.e. indentation
* No need to explicitly mention data type

## Unvectorized vs Vectorized Implementations

In [0]:
# Importing libraries
import numpy as np

# Defining matrices
mat_a = [[6, 7, 8],[5, 4, 5],[1, 1, 1]]
mat_b = [[1, 2, 3],[1, 2, 3],[1, 2, 3]]

# Getting a row from matrix
def get_row(matrix, row):
    return matrix[row]

# Getting a coloumn from matrix
def get_column(matrix, column_number):
    column = []
 
    for i in range(len(matrix)):
        column.append(matrix[i][column_number])
 
    return column

# Multiply a row with coloumn
def unv_dot_product(vector_one, vector_two):
    total = 0
 
    if len(vector_one) != len(vector_two):
        return total
 
    for i in range(len(vector_one)):
        product = vector_one[i] * vector_two[i]
        total += product
 
    return total

# Multiply two matrixes
def matrix_multiplication(matrix_one, matrix_two):
    m_rows = len(matrix_one)
    p_columns = len(matrix_two[0])
    result = []
    
    for i in range(m_rows):
        row_result = []
 
        for j in range(p_columns):
            row = get_row(matrix_one, i)
            column = get_column(matrix_two, j)
            product = unv_dot_product(row, column)
            
            row_result.append(product) 
        result.append(row_result)
        
    return result

print("Matrix A: ", mat_a,"\n")
print("Matrix B: ", mat_b,"\n")

print("Unvectorized Matrix Multiplication\n",matrix_multiplication(mat_a,mat_b),"\n")


Matrix A:  [[6, 7, 8], [5, 4, 5], [1, 1, 1]] 

Matrix B:  [[1, 2, 3], [1, 2, 3], [1, 2, 3]] 

Unvectorized Matrix Multiplication
 [[21, 42, 63], [14, 28, 42], [3, 6, 9]] 



In [0]:
# Vectorized Implementation
npm_a = np.array(mat_a)
npm_b = np.array(mat_b)

print("Vectorized Matrix Multiplication\n",npm_a.dot(npm_b),"\n") 
# A.dot(B) is a numpy built-in function for dot product

Vectorized Matrix Multiplication
 [[21 42 63]
 [14 28 42]
 [ 3  6  9]] 



### Tip:
* Vectorization reduces number of lines of code
* Always prefer libraries and avoid coding from scratch

## Essential Python Packages: Numpy, Pandas, Matplotlib

In [0]:
# Load library
import numpy as np

In [0]:
# Create row vector
vector = np.array([1, 2, 3, 4, 5, 6])
print("Vector:",vector)

# Select second element
print("Element 2 in Vector is",vector[1])

Vector: [1 2 3 4 5 6]
Element 2 in Vector is 2


In [0]:
np.flatten

In [0]:
# Create matrix
matrix = np.array([[1, 2, 3],
                   [4, 5, 6],
                   [7, 8, 9]])

print("Matrix\n",matrix)

# Select second row
print("Second row of Matrix\n",matrix[1,:])
print("Third coloumn of Matrix\n",matrix[:,2])

Matrix
 [[1 2 3]
 [4 5 6]
 [7 8 9]]
Second row of Matrix
 [4 5 6]
Third coloumn of Matrix
 [3 6 9]


In [0]:
# Create Tensor
tensor = np.array([ [[[1, 1], [1, 1]], [[2, 2], [2, 2]]],
                    [[[3, 3], [3, 3]], [[4, 4], [4, 4]]] ])

print("Tensor\n",tensor.shape)

Tensor
 (2, 2, 2, 2)


### Matrix properties

In [0]:
# Create matrix
matrix = np.array([[1, 2, 3],
                   [4, 5, 6],
                   [7, 8, 9]])

print("Matrix Shape:",matrix.shape)
print("Number of elements:",matrix.size)
print("Number of dimentions:",matrix.ndim)
print("Average of matrix:",np.mean(matrix))
print("Maximum number:",np.max(matrix))
print("Coloumn with minimum numbers:",np.min(matrix, axis=1))
print("Diagnol of matrix:",matrix.diagonal())
print("Determinant of matrix:",np.linalg.det(matrix))

Matrix Shape: (3, 3)
Number of elements: 9
Number of dimentions: 2
Average of matrix: 5.0
Maximum number: 9
Coloumn with minimum numbers: [1 4 7]
Diagnol of matrix: [1 5 9]
Determinant of matrix: 0.0


### Matrix Operations

In [0]:
print("Flattened Matrix\n",matrix.flatten())
print("Reshaping Matrix\n",matrix.reshape(9,1))
print("Transposed Matrix\n",matrix.T)

Flattened Matrix
 [1 2 3 4 5 6 7 8 9]
Reshaping Matrix
 [[1]
 [2]
 [3]
 [4]
 [5]
 [6]
 [7]
 [8]
 [9]]
Transposed Matrix
 [[1 4 7]
 [2 5 8]
 [3 6 9]]


In [0]:
# Create matrix
matrix_a = np.array([[1, 1, 1],
                     [1, 1, 1],
                     [1, 1, 2]])

# Create matrix
matrix_b = np.array([[1, 3, 1],
                     [1, 3, 1],
                     [1, 3, 8]])

print("Matrix Addition\n",np.add(matrix_a, matrix_b))
print("Scalar Multiplication\n",np.multiply(matrix_a, matrix_b))
print("Matrix Multiplication\n",np.dot(matrix_a, matrix_b))

Matrix Addition
 [[ 2  4  2]
 [ 2  4  2]
 [ 2  4 10]]
Scalar Multiplication
 [[ 1  3  1]
 [ 1  3  1]
 [ 1  3 16]]
Matrix Multiplication
 [[ 3  9 10]
 [ 3  9 10]
 [ 4 12 18]]


In [0]:
x = np.arange(5) 
print(x)

In [0]:
x = np.arange(5, dtype = float)
print(x)

In [0]:
# numbers with difference of 2
x = np.arange(10,20,2) 
print x

In [0]:
x = np.linspace(10,20,5) 
print(x)

##let's get a rid of statistics with numpy

In [0]:
# X is a Python List
X = [32.32, 56.98, 21.52, 44.32, 55.63, 13.75, 43.47, 43.34]

# Sorting the data and printing it.
X.sort()
print(X)
# [13.75, 21.52, 32.32, 43.34, 43.47, 44.32, 55.63, 56.98]

# Using NumPy's built-in functions to Find Mean, Median, SD and Variance
mean = np.mean(X)
median = np.median(X)
sd = np.std(X)
variance = np.var(X)

# Printing the values
print("Mean", mean) # 38.91625
print("Median", median) # 43.405
print("Standard Deviation", sd) # 14.3815654029
print("Variance", variance) # 206.829423437

###The tool min returns the minimum value along a given axis.

In [0]:
x =np.arrange()

In [0]:
my_array = np.array([[2, 5], 
                        [3, 7],
                        [1, 3],
                        [4, 0]])

print np.min(my_array, axis = 0)         #Output : [1 0]
print np.min(my_array, axis = 1)         #Output : [2 3 1 0]
print np.min(my_array, axis = None)      #Output : 0
print np.min(my_array)                   #Output : 0

###The tool max returns the maximum value along a given axis.

In [0]:
my_array = numpy.array([[2, 5], 
                        [3, 7],
                        [1, 3],
                        [4, 0]])

print np.max(my_array, axis = 0)         #Output : [4 7]
print np.max(my_array, axis = 1)         #Output : [5 7 3 4]
print np.max(my_array, axis = None)      #Output : 7
print np.max(my_array)                   #Output : 7

In [0]:
a = np.array([[1,2], [3, 4], [5, 6]])

bool_idx = (a > 2)   # Find the elements of a that are bigger than 2;
                     # this returns a numpy array of Booleans of the same
                     # shape as a, where each slot of bool_idx tells
                     # whether that element of a is > 2.

print(bool_idx)      # Prints "[[False False]
                     #          [ True  True]
                     #          [ True  True]]"

# We use boolean array indexing to construct a rank 1 array
# consisting of the elements of a corresponding to the True values
# of bool_idx
print(a[bool_idx])  # Prints "[3 4 5 6]"

# We can do all of the above in a single concise statement:
print(a[a > 2])     # Prints "[3 4 5 6]"

#NumPy Challenge
### You are given a 2-D array with dimensions N X M.
### Your task is to perform the min function over axis 1 and then find the max of that.
Sample Input

```
4 2
2 5
3 7
1 3
4 0
```

This is formatted as code

Sample Output

3

### Pandas

In [0]:
import pandas as pd

In [0]:
df=pd.read_csv("Income.csv", encoding="ISO-8859-1")
print("Data\n")
df

In [5]:
df.columns
df

Index(['id', 'State_Code', 'State_Name', 'State_ab', 'County', 'City', 'Place',
       'Type', 'Primary', 'Zip_Code', 'Area_Code', 'ALand', 'AWater', 'Lat',
       'Lon', 'Mean', 'Median', 'Stdev', 'sum_w'],
      dtype='object')

In [10]:
df[['State_Name','State_Code']]

Unnamed: 0,State_Name,State_Code
0,Alabama,1
1,Alabama,1
2,Alabama,1
3,Alabama,1
4,Alabama,1
5,Alabama,1
6,Alabama,1
7,Alabama,1
8,Alabama,1
9,Alabama,1


In [12]:
df[:5]

Unnamed: 0,id,State_Code,State_Name,State_ab,County,City,Place,Type,Primary,Zip_Code,Area_Code,ALand,AWater,Lat,Lon,Mean,Median,Stdev,sum_w
0,1011000,1,Alabama,AL,Mobile County,Chickasaw,Chickasaw city,City,place,36611,251,10894952,909156,30.77145,-88.079697,38773,30506,33101,1638.260513
1,1011010,1,Alabama,AL,Barbour County,Louisville,Clio city,City,place,36048,334,26070325,23254,31.708516,-85.611039,37725,19528,43789,258.017685
2,1011020,1,Alabama,AL,Shelby County,Columbiana,Columbiana city,City,place,35051,205,44835274,261034,33.191452,-86.615618,54606,31930,57348,926.031
3,1011030,1,Alabama,AL,Mobile County,Satsuma,Creola city,City,place,36572,251,36878729,2374530,30.874343,-88.009442,63919,52814,47707,378.114619
4,1011040,1,Alabama,AL,Mobile County,Dauphin Island,Dauphin Island,Town,place,36528,251,16204185,413605152,30.250913,-88.171268,77948,67225,54270,282.320328


In [13]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 32526 entries, 0 to 32525
Data columns (total 19 columns):
id            32526 non-null int64
State_Code    32526 non-null int64
State_Name    32526 non-null object
State_ab      32526 non-null object
County        32526 non-null object
City          32526 non-null object
Place         32526 non-null object
Type          32526 non-null object
Primary       32526 non-null object
Zip_Code      32526 non-null int64
Area_Code     32526 non-null object
ALand         32526 non-null int64
AWater        32526 non-null int64
Lat           32526 non-null float64
Lon           32526 non-null float64
Mean          32526 non-null int64
Median        32526 non-null int64
Stdev         32526 non-null int64
sum_w         32526 non-null float64
dtypes: float64(3), int64(8), object(8)
memory usage: 4.7+ MB


In [0]:
print("Top Elements\n")
df.head(3)

Top Elements



Unnamed: 0,GEOID,State,2005,2006,2007,2008,2009,2010,2011,2012,2013
0,04000US01,Alabama,37150,37952,42212,44476,39980,40933,42590,43464,41381
1,04000US02,Alaska,55891,56418,62993,63989,61604,57848,57431,63648,61137
2,04000US04,Arizona,45245,46657,47215,46914,45739,46896,48621,47044,50602


In [0]:
print("Bottom Elements\n")
df.tail(3)

Bottom Elements



Unnamed: 0,GEOID,State,2005,2006,2007,2008,2009,2010,2011,2012,2013
3,04000US05,Arkansas,36658,37057,40795,39586,36538,38587,41302,39018,39919
4,04000US06,California,51755,55319,55734,57014,56134,54283,53367,57020,57528
5,04000US07,Chicago,-999,-999,-999,-999,-999,-999,-999,-999,-999


In [0]:
print("Specific Coloumn\n")
df['State'].head(3)

Specific Coloumn



0    Alabama
1     Alaska
2    Arizona
Name: State, dtype: object

In [0]:
print("Replace negative numbers with NaN\n")
df.replace(-999,np.nan)

Replace negative numbers with NaN



Unnamed: 0,GEOID,State,2005,2006,2007,2008,2009,2010,2011,2012,2013
0,04000US01,Alabama,37150.0,37952.0,42212.0,44476.0,39980.0,40933.0,42590.0,43464.0,41381.0
1,04000US02,Alaska,55891.0,56418.0,62993.0,63989.0,61604.0,57848.0,57431.0,63648.0,61137.0
2,04000US04,Arizona,45245.0,46657.0,47215.0,46914.0,45739.0,46896.0,48621.0,47044.0,50602.0
3,04000US05,Arkansas,36658.0,37057.0,40795.0,39586.0,36538.0,38587.0,41302.0,39018.0,39919.0
4,04000US06,California,51755.0,55319.0,55734.0,57014.0,56134.0,54283.0,53367.0,57020.0,57528.0
5,04000US07,Chicago,,,,,,,,,


In [17]:
df['State_ab'].value_counts()

CA    3280
TX    2300
NY    2160
FL    1661
PA    1475
IL    1431
OH    1349
MI    1174
NC     915
NJ     888
GA     818
VA     785
IN     704
MO     700
WA     688
WI     674
MA     670
MN     641
AZ     617
TN     610
MD     586
CO     526
AL     526
LA     511
KY     497
OK     480
SC     459
IA     454
OR     387
KS     381
PR     380
CT     355
AR     340
MS     321
NE     275
NV     265
UT     261
WV     248
NM     240
ME     157
ID     149
MT     148
HI     138
NH     131
SD     128
ND     119
RI     109
AK     105
DE      88
VT      83
WY      75
DC      64
Name: State_ab, dtype: int64

In [21]:
df[df['id']>101130]

Unnamed: 0,id,State_Code,State_Name,State_ab,County,City,Place,Type,Primary,Zip_Code,Area_Code,ALand,AWater,Lat,Lon,Mean,Median,Stdev,sum_w
0,1011000,1,Alabama,AL,Mobile County,Chickasaw,Chickasaw city,City,place,36611,251,10894952,909156,30.771450,-88.079697,38773,30506,33101,1638.260513
1,1011010,1,Alabama,AL,Barbour County,Louisville,Clio city,City,place,36048,334,26070325,23254,31.708516,-85.611039,37725,19528,43789,258.017685
2,1011020,1,Alabama,AL,Shelby County,Columbiana,Columbiana city,City,place,35051,205,44835274,261034,33.191452,-86.615618,54606,31930,57348,926.031000
3,1011030,1,Alabama,AL,Mobile County,Satsuma,Creola city,City,place,36572,251,36878729,2374530,30.874343,-88.009442,63919,52814,47707,378.114619
4,1011040,1,Alabama,AL,Mobile County,Dauphin Island,Dauphin Island,Town,place,36528,251,16204185,413605152,30.250913,-88.171268,77948,67225,54270,282.320328
5,1011050,1,Alabama,AL,Cullman County,Cullman,Dodge City,Town,place,35057,256,8913021,26837,34.045414,-86.882670,50715,42643,35886,173.325959
6,1011060,1,Alabama,AL,Escambia County,East Brewton,East Brewton city,City,place,36426,251,8826252,91015,31.091440,-87.055345,33737,23610,28256,758.771322
7,1011070,1,Alabama,AL,Elmore County,Coosada,Elmore,Town,place,36020,334,10222339,176500,32.544337,-86.336446,46319,40242,38941,397.052564
8,1011080,1,Alabama,AL,Morgan County,Eva,Eva,Town,place,35621,256,10544874,78981,34.326504,-86.765318,57994,39591,47235,137.496039
9,1011090,1,Alabama,AL,Talladega County,Sylacauga,Fayetteville,CDP,place,35151,256,45178321,6034534,33.168097,-86.442774,54807,41712,51359,380.728238


In [22]:
query = df['id']>101130
df[query]

Unnamed: 0,id,State_Code,State_Name,State_ab,County,City,Place,Type,Primary,Zip_Code,Area_Code,ALand,AWater,Lat,Lon,Mean,Median,Stdev,sum_w
0,1011000,1,Alabama,AL,Mobile County,Chickasaw,Chickasaw city,City,place,36611,251,10894952,909156,30.771450,-88.079697,38773,30506,33101,1638.260513
1,1011010,1,Alabama,AL,Barbour County,Louisville,Clio city,City,place,36048,334,26070325,23254,31.708516,-85.611039,37725,19528,43789,258.017685
2,1011020,1,Alabama,AL,Shelby County,Columbiana,Columbiana city,City,place,35051,205,44835274,261034,33.191452,-86.615618,54606,31930,57348,926.031000
3,1011030,1,Alabama,AL,Mobile County,Satsuma,Creola city,City,place,36572,251,36878729,2374530,30.874343,-88.009442,63919,52814,47707,378.114619
4,1011040,1,Alabama,AL,Mobile County,Dauphin Island,Dauphin Island,Town,place,36528,251,16204185,413605152,30.250913,-88.171268,77948,67225,54270,282.320328
5,1011050,1,Alabama,AL,Cullman County,Cullman,Dodge City,Town,place,35057,256,8913021,26837,34.045414,-86.882670,50715,42643,35886,173.325959
6,1011060,1,Alabama,AL,Escambia County,East Brewton,East Brewton city,City,place,36426,251,8826252,91015,31.091440,-87.055345,33737,23610,28256,758.771322
7,1011070,1,Alabama,AL,Elmore County,Coosada,Elmore,Town,place,36020,334,10222339,176500,32.544337,-86.336446,46319,40242,38941,397.052564
8,1011080,1,Alabama,AL,Morgan County,Eva,Eva,Town,place,35621,256,10544874,78981,34.326504,-86.765318,57994,39591,47235,137.496039
9,1011090,1,Alabama,AL,Talladega County,Sylacauga,Fayetteville,CDP,place,35151,256,45178321,6034534,33.168097,-86.442774,54807,41712,51359,380.728238


In [24]:
q1 =df['id']>101130
q2 = df['id']<1011040
df[(q1) & (q2)]

Unnamed: 0,id,State_Code,State_Name,State_ab,County,City,Place,Type,Primary,Zip_Code,Area_Code,ALand,AWater,Lat,Lon,Mean,Median,Stdev,sum_w
0,1011000,1,Alabama,AL,Mobile County,Chickasaw,Chickasaw city,City,place,36611,251,10894952,909156,30.771450,-88.079697,38773,30506,33101,1638.260513
1,1011010,1,Alabama,AL,Barbour County,Louisville,Clio city,City,place,36048,334,26070325,23254,31.708516,-85.611039,37725,19528,43789,258.017685
2,1011020,1,Alabama,AL,Shelby County,Columbiana,Columbiana city,City,place,35051,205,44835274,261034,33.191452,-86.615618,54606,31930,57348,926.031000
3,1011030,1,Alabama,AL,Mobile County,Satsuma,Creola city,City,place,36572,251,36878729,2374530,30.874343,-88.009442,63919,52814,47707,378.114619
54,101901,1,Alabama,AL,Chambers County,Wadley,Abanda,CDP,place,36276,256,7764034,34284,33.091627,-85.527029,14741,11398,10903,47.998400
55,101910,1,Alabama,AL,Pickens County,Aliceville,Aliceville city,City,place,35442,205,11819855,0,33.123686,-88.159364,28347,20997,28311,1057.617393
56,101920,1,Alabama,AL,Jefferson County,Odenville,Argo,Town,place,35120,205,27893577,150331,33.691576,-86.503766,242857,300000,25317,1.133107
57,101930,1,Alabama,AL,Lee County,Auburn,Auburn city,City,place,36830,334,152375113,2646161,32.607722,-85.489545,59694,38029,60946,13447.913832
58,101940,1,Alabama,AL,Mobile County,Bayou La Batre,Bayou La Batre city,City,place,36509,251,19400655,368667,30.407586,-88.263271,50047,35147,43788,519.274814
59,101950,1,Alabama,AL,Jefferson County,Bessemer,Bessemer city,City,place,35022,205,104798633,439841,33.370789,-86.971596,41239,29150,38565,7625.528576


In [30]:
df[df['id']>101130]['id']

0        1011000
1        1011010
2        1011020
3        1011030
4        1011040
5        1011050
6        1011060
7        1011070
8        1011080
9        1011090
10       1011100
11       1011110
12       1011120
13       1011130
14       1011140
15       1011150
16       1011160
17       1011170
18       1011180
19       1011190
20       1011200
21       1011210
22       1011220
23       1011230
24       1011240
25       1011250
26       1011260
27       1011270
28       1011280
29       1011290
          ...   
32496    7202736
32497    7202746
32498    7202756
32499     720276
32500    7202766
32501    7202776
32502    7202786
32503    7202796
32504    7202806
32505    7202816
32506    7202826
32507    7202836
32508    7202846
32509    7202856
32510     720286
32511    7202866
32512    7202876
32513    7202886
32514    7202896
32515    7202906
32516    7202916
32517    7202926
32518    7202936
32519    7202946
32520    7202956
32521     720296
32522    7202966
32523    72029

In [26]:
df.describe()

Unnamed: 0,id,State_Code,Zip_Code,ALand,AWater,Lat,Lon,Mean,Median,Stdev,sum_w
count,32526.0,32526.0,32526.0,32526.0,32526.0,32526.0,32526.0,32526.0,32526.0,32526.0,32526.0
mean,62037070.0,28.624885,50182.648404,116589300.0,6952054.0,37.731983,-91.303844,66703.986042,85452.938818,47273.695321,576.910273
std,111554600.0,16.297205,29410.122808,1280894000.0,209209300.0,5.57945,16.227588,30451.194599,87810.895132,16555.486882,3911.006939
min,1026.0,1.0,601.0,0.0,0.0,17.929085,-175.860041,0.0,0.0,0.0,0.0
25%,8021282.0,13.0,26362.0,1906991.0,0.0,34.013469,-97.664034,46015.5,36046.25,36075.0,201.436458
50%,29011680.0,29.0,48163.0,5022976.0,27033.5,38.925588,-87.13928,60738.0,51874.5,46179.0,329.482618
75%,48028990.0,42.0,76712.0,30909840.0,508207.8,41.495793,-79.852969,82223.5,80915.0,58078.0,590.22767
max,480221100.0,72.0,99950.0,91632670000.0,24532280000.0,71.2535,-65.500823,242857.0,300000.0,113936.0,612241.922964
