Chapter8 - Text Analytics

In [1]:
# Part 1: Text preprocessing (removing punctuation and digits)
s = "Hello!! 1984 is great, isn't it? So is 2018!!!"
import string
puncs = string.punctuation # the string !"#$%&'()*+,-./:;<=>?@[\]^_`{|}~
dgts = string.digits # the string 0123456789
# str.maketrans(x, y) maps the i-th character of x to the i-th character of y,
# so each table below replaces every listed character with one space.
table_p = str.maketrans(puncs, len(puncs) * " ") # punctuation -> " " (built here but not applied in this cell)
table_d = str.maketrans(dgts, len(dgts) * " ") # digit -> " "
print(s.translate(table_d)) # replace every digit with " "; punctuation is left untouched
string.printable # all printable digits, letters, punctuation and whitespace characters

Hello!!      is great, isn't it? So is     !!!


'0123456789abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ!"#$%&\'()*+,-./:;<=>?@[\\]^_`{|}~ \t\n\r\x0b\x0c'

In [None]:
# Part 2: Getting the word frequency & removing stopwords using nltk
#2.1 Getting the word frequency
import nltk
nltk.download('punkt') # tokenizer data used by nltk.word_tokenize
with open('uber_s1.txt') as f: # f is the open file handle; it is closed when the with-block ends
    words = nltk.word_tokenize(f.read().lower()) # f.read() is a str; lower-case it, then split it into a list of tokens

from collections import Counter
c = Counter(words) # takes the token list, returns a dict-like object mapping token -> count
c.most_common(20) # list of (token, count) tuples; not displayed because it is not the cell's last expression

freq = nltk.FreqDist(words) # frequency table built from the same token list
print(freq) 
freq.plot(20) # plot the 20 most frequent tokens

#2.2 Removing stopwords using nltk
import nltk
nltk.download('stopwords')
stopwords = nltk.corpus.stopwords.words('english') # returns a list of stopword strings
print(type(stopwords), len(stopwords), stopwords, sep="\n")

# A list membership test scans the whole list for every token; a set lookup
# does not, and it yields exactly the same filtering result.
stopword_set = set(stopwords)

# Version 1: accumulator loop. Keep tokens that are not stopwords and are
# longer than one character (this drops single punctuation marks and letters).
words2 = [] # our accumulator list
for w in words:
    if w not in stopword_set and len(w) > 1:
        words2.append(w)
# Version 2: the same filter written as a list comprehension
words3 = [w for w in words if w not in stopword_set and len(w) > 1]

# Plot the cleaned list
freq2 = nltk.FreqDist(words2)
print('common words without stop words')
freq2.plot(20);

#2.3 Word cloud visualization with wordcloud
#!pip install wordcloud
from wordcloud import WordCloud
import matplotlib.pyplot as plt
import nltk
%matplotlib inline
text = open('uber_s1.txt').read() # string
wordcloud = WordCloud(width=800, height=400).generate(text)
plt.figure(figsize=(10,10)) # set up figure size
plt.imshow(wordcloud) # word cloud image show
plt.axis("on") # turn on axis
plt.savefig('my_word_cloud.png') # save as PNG file
plt.savefig('my_word_cloud.pdf') # save as PDF file
plt.show()  # show in Jupyter notebook

# for a clean list
import nltk
stopwords = nltk.corpus.stopwords.words('english')
text2 = '' # our string accumulator
for word in text.split():
    if len(word) == 1 or word in stopwords:
        continue
    text2 = text2 + ' ' + word

In [None]:
#Part 3 Sentiment analysis using TextBlob module
#3.1 Sentiment Analysis using TextBlob
!pip install textblob
from textblob import TextBlob
s = 'BAIT 508 is a great class'
tb = TextBlob(s) # takes a string and wraps the text in a TextBlob object
print(tb.sentiment) # Sentiment(polarity=0.8, subjectivity=0.75)
tb.sentiment.polarity # not displayed: only a cell's last expression is echoed
tb.sentiment.subjectivity
#3.2 sentiment on a news article
from textblob import TextBlob
import matplotlib.pyplot as plt

with open('data/uber_article.txt') as infile:
    content = infile.read()

tb = TextBlob(content)
sentences = tb.sentences # split the text into a list of sentence objects

# Score every sentence once and reuse the scores below, instead of running
# the sentiment analysis separately for printing and for plotting.
# Each sentence object is converted back to a plain string before scoring.
sentiments = [TextBlob(str(sentence)).sentiment for sentence in sentences]
sub_list = [sentiment.subjectivity for sentiment in sentiments]
pol_list = [sentiment.polarity for sentiment in sentiments]

# Check some high polarity sentences (absolute polarity above 0.4)
for s, pol in zip(sentences, pol_list):
    if abs(pol) > 0.4:
        print(pol)
        print(s)
        print()


def plot_score_hist(scores, xlabel, outfile):
    """Draw a 10-bin histogram of per-sentence scores, save it and show it.

    scores  -- list of numeric scores, one per sentence
    xlabel  -- label for the x-axis
    outfile -- path the figure is saved to before it is displayed
    """
    plt.hist(scores, bins=10)
    plt.xlabel(xlabel)
    plt.ylabel('sentence count')
    plt.grid(True)
    plt.savefig(outfile) # save before show(), which finalises the figure
    plt.show()


#visualize
plot_score_hist(sub_list, 'subjectivity score', 'subjectivity.pdf')
plot_score_hist(pol_list, 'polarity score', 'polarity.pdf')

In [None]:
#Part 4 POS & NER
#4.1 Tokenization and POS tagging with nltk
import nltk
from nltk.tokenize import sent_tokenize, word_tokenize
from pprint import pprint

with open('data/uber_article.txt') as infile:
    article = infile.read()
sentences = nltk.sent_tokenize(article) # input: text string; returns a list of sentence strings
# Tokenize each sentence into words: token_sentences
token_sentences = [word_tokenize(sent) for sent in sentences] # a list of word lists, one per sentence

nltk.download('averaged_perceptron_tagger') # model data used by nltk.pos_tag
# Tag each tokenized sentence into parts of speech: pos_sentences
pos_sentences = [nltk.pos_tag(sent) for sent in token_sentences] # a list of lists of (word, tag) tuples

# Look up what individual tags mean:
nltk.download('tagsets')
nltk.help.upenn_tagset('DT')
nltk.help.upenn_tagset('NNP')

#4.2 Spacy with NER
!pip install spacy
import spacy
!python3 -m spacy download en
%matplotlib inline

import matplotlib.pyplot as plt
import spacy
from collections import defaultdict
from pprint import pprint

nlp = spacy.load('en', tagger=False, parser=False, matcher=False)

with open('data/uber_s1.txt') as infile:
    article = infile.read()
doc = nlp(article)
doc.ents

# create an empty defaultdict
ner_categories = defaultdict(int)

# Print all of the found entities and their labels
for ent in doc.ents:
    print('label', ent.label_, '\ttext', ent.text)
    ner_categories[ent.label_] += 1
    #print(ent.label_, ent.text)    
ner_categories

# Create a list from the dictionary keys for the chart labels: labels
labels = list(ner_categories.keys())

# Create a list of the values: values
values = [ner_categories.get(l) for l in labels]
#values = list(ner_categories.values())

# Create the pie chart
plt.figure(figsize=(10, 8))
plt.pie(values, 
        labels=labels, 
        autopct='%1.1f%%', 
        startangle=140)

# Display the chart
plt.show()

Chapter9 Data Science Management

#1 What is Data Science?
#From Wikipedia:
#A multi-disciplinary field that:
#Uses scientific methods, processes, algorithms, and systems
#To extract knowledge and insights
#From structured and unstructured data

#2 Why now? Big Data + Powerful Computers

#3 Data science workflow:
#Data collection : Customer survey, Clickstream data, Sales, transactions
#Data exploration and visualization : Dashboards, One-off reports
#Prediction model and experimentation: Hypothesis testing with A/B testing, Build machine learning models for prediction

#4 Building Data Science Team
#Data engineer: Control data flow, information architecture, build storage solutions, maintain data access, Use SQL, Java/Scala/Python to process data and automate
#Data analyst: Create dashboards, hypothesis testing, data visualization, Use Spreadsheets (Excel), SQL for large-scale analysis, BI tools (Tableau, Power BI, Looker)
#ML scientist: Build prediction models, classifications (e.g., stock price prediction, image processing, sentiment analysis), Usually need CS-like background. Use Python and R

#5 Data collection and storage
#Data storage and databases:
#Parallel storage solutions: Multiple computers in a cluster on premise (“on-prem”); On the Cloud: Microsoft Azure, Amazon Web Services, Google Cloud
#Types of storage: Unstructured data: Email, text, videos, audios, web pages, social media, stored in “document database” (e.g., MongoDB); Tabular (Structured) data: relational database (e.g., MySQL, Oracle)
#Databases: Document database: NoSQL (not only SQL), Relational database: SQL (structured query language)
#Decision points: Physical location (on-prem, cloud)? Data types?

#6 Machine Learning Concepts
#What is Machine Learning (ML)?
Use data to develop statistical models that can predict outcomes for new data,
as opposed to codifying human knowledge with explicit instructions (for loops, if conditions, etc.)
A.k.a. predictive modeling, data mining, predictive analytics
An example: spam filtering
Predict if an email is spam or not
Datasets: features and labels
Features: Data that might predict the label (e.g., email addr, domain, text)
Labels: the correct answers to learn from (e.g., spam or not)
#Supervised machine learning
Predictions from data with “features” (A) and “labels” (B)
Learning A-to-B mapping
Examples:
Given an email, predict if it is spam or not (spam detection)
Given a Facebook photo, identify the persons in it (face recognition)
Given a customer and behavior, predict if the customer will churn or not
Algorithms:
Classification (labels=discrete values): logistic regression, support vector machines, k-nearest neighbors, etc.
Regression (labels=continuous values): linear regression, random forest, etc.
Some algorithms can do both classification and regression

In [None]:
#part 2
!pip install textblob --user
#1. Sentiment Analysis
from textblob import TextBlob
sent = 'BAIT 508 is a great class'
tb = TextBlob(sent)
# sentiment scores
print(tb.sentiment)
# polarity score
print(tb.sentiment.polarity)
# subjectivity score
print(tb.sentiment.subjectivity)
# a negated sentence, to compare its scores with the positive one above
s_neg = 'this is not a great class!!!'
tb_neg = TextBlob(s_neg)
print(tb_neg.sentiment)
# an informal sentence; the result is displayed as the last expression
tb = TextBlob('I was like happy and stuff')
tb.sentiment

#2. Language translation with TextBlob
# NOTE(review): detect_language() and translate() appear to have been removed
# from recent TextBlob releases -- confirm against the installed version.
s = 'BAIT 508 is a great class. Sauder MBAN students are awesome!'
tb = TextBlob(s)
print(tb.detect_language())
import time

s = 'BAIT 508 is a great class. Sauder MBAN students are awesome!'
tb = TextBlob(s)

# target language codes to translate the sentence into
lst_language = ['ko', 'cs', 'es', 'zh', 'hi', 'id', 'ru', 'ar', 'th', 'vi', 'ja']
for lang in lst_language:
    print(tb.translate(to=lang))
    time.sleep(1) # pause one second between translation calls

#3. Spell correction with TextBlob
b = TextBlob("I havv goood speling!")
print(b.correct())
b = TextBlob("I m trying mi best to make prfect spellling")
print(b.correct())

#4. Text summarization.
# NOTE(review): the gensim.summarization module is believed to be absent from
# gensim 4.x -- confirm the installed version before relying on this section.
!pip install gensim --user
from gensim.summarization import keywords
from gensim.summarization import summarize
with open('uber_article.txt') as f:
    uber_text = f.read()
print(len(uber_text), 'characters in the text')
print(summarize(uber_text)) # summary with the default settings
print(summarize(uber_text, ratio=0.05)) # summary with ratio=0.05
print(keywords(uber_text))

Chapter7 Import Data

In [None]:
#Part 1 Import flat files
#1.1 Exploring your working directory
!ls
#1.2 Importing entire text files
# Open a file: file
file = open('data/moby_dick.txt', mode='r')

#check charactors
with open("feed_header_2017-2019.tsv", "r") as infile:
    text = infile.read()
ans1 = int(len(text))
print(ans1)

#check words
ans2 = int(len(text.split()))
print(ans2)

#check lines: 
with open("feed_header_2017-2019.tsv", "r") as infile:
    lines = infile.readlines()
ans3 = int(len(lines))
print(ans3)

# Check whether file is closed
print(file.closed)

#1.3 Writing a text file
outfile = open('bait508_hw1.txt', 'w')
outfile.write('Lee, Gene, email@gmail.com\n')
outfile.write('ans1=100\n')
outfile.write('ans2=200\n')
outfile.close()

#1.4 Use a context manager (with) and read a text file line by line
with open('data/moby_dick.txt') as file:
    # each readline() call returns the next line, so this prints the first three lines
    print(file.readline())
    print(file.readline())
    print(file.readline())
    
#1.5 Go over a huge file line by line
with open('data/blockchain.txt') as file:
    for line in file: # iterating the handle yields one line at a time instead of loading the whole text into RAM
        print(line[0:80]) # print the first 80 characters in the line

#1.6 Peek at the first lines of some data files with the shell's head command
!head data/mnist_test.csv
!head data/seaslug.txt
!head data/titanic.tsv

# Import pandas as pd
import pandas as pd

# Assign the filename: file (this rebinds `file` from a file handle to a path string)
file = 'data/titanic.tsv'

# Read the tab-separated file into a DataFrame: df
df = pd.read_csv(file, sep='\t')

# View the head of the DataFrame
print(df.head())



In [None]:
#Part 2 Import flat files from the web
#2.1 Importing flat files from the web
# Import package
from urllib.request import urlretrieve
# Import pandas
import pandas as pd
# Assign url of file: url
url = 'https://s3.amazonaws.com/assets.datacamp.com/production/course_1606/datasets/winequality-red.csv'
# Download the file and save it locally under the given name
urlretrieve(url, 'winequality-red.csv')
# Read the local copy into a DataFrame; fields are separated by ';'
df = pd.read_csv('winequality-red.csv', sep=';')
df.head()

#2.2 Opening and reading flat files from the web (without storing locally)
# Import packages
import matplotlib.pyplot as plt
import pandas as pd
# Assign url of file: url
url = 'https://s3.amazonaws.com/assets.datacamp.com/production/course_1606/datasets/winequality-red.csv'
# Read file into a DataFrame directly from the URL: df
df = pd.read_csv(url, sep=';')
# Head of the DataFrame (not displayed: it is not the cell's last expression)
df.head()
# Plot a histogram of the first column of df (iloc[:, 0:1] keeps it as a one-column DataFrame)
pd.DataFrame.hist(df.iloc[:, 0:1])
plt.xlabel('fixed acidity (g(tartaric acid)/dm$^3$)')
plt.ylabel('count')
plt.show()

####2.3 Parsing webpage with requests and BeautifulSoup!!
#2.3.1 Download webpage with requests
# Import packages
import requests
from bs4 import BeautifulSoup

# Specify url: url
url = 'https://www.sauder.ubc.ca/thought-leadership/divisions/accounting-information-systems/people'

# Package the request, send the request and catch the response: r
r = requests.get(url)
type(r)
#2.3.2 Parse the text with BeautifulSoup
# Extract the response body as an HTML string: html_doc
html_doc = r.text
print(type(html_doc)) # the extracted HTML is a str (type(r) was already inspected above)

# Create a BeautifulSoup object from the HTML: soup
soup = BeautifulSoup(html_doc, "lxml")
print(soup)
# Prettify the BeautifulSoup object: pretty_soup
pretty_soup = soup.prettify()

# Print the prettified HTML
print(pretty_soup)

# let's check webpage title
print(soup.title)

# let's now check webpage text, and keep a copy on disk
text = soup.get_text()
print(text)
with open('web_text.txt', 'w') as outfile:
    outfile.write(text)
    
# Find all 'div' tags <div class="profile">
div_tags = soup.find_all('div', {'class': 'profile'})
print(type(div_tags))
print(div_tags)
print(len(div_tags))

# if you want to find tags without specific parameters condition, 
# div_tags = soup.find_all('div')

# Print the name and position found in each profile.
# find() returns None when a profile lacks the element, so incomplete
# profiles are skipped explicitly instead of raising AttributeError.
for tag in div_tags:
    name_tag = tag.find('h4')
    position_tag = tag.find('div', {'class': 'profile__content__position'})
    if name_tag is None or position_tag is None:
        continue
    name = name_tag.get_text().strip()
    position = position_tag.get_text().strip() # the position text, not an e-mail address
    print(name, position)

# Import package
import pandas as pd
url = 'http://s3.amazonaws.com/assets.datacamp.com/course/importing_data_into_r/latitude.xls'
# sheet_name=None loads every sheet; the result maps sheet name -> DataFrame
xl = pd.read_excel(url, sheet_name=None)
print(xl.keys()) # the sheet names
print(xl['1700'].head()) # first rows of the sheet named '1700'

import requests
from bs4 import BeautifulSoup

url = "https://en.wikipedia.org/wiki/Standard_Industrial_Classification"
r = requests.get(url)
html_doc = r.text
soup = BeautifulSoup(html_doc, "lxml")
tables = soup.find_all('table')
first_table = tables[0]
trs = first_table.find_all('tr')

# Map the text of each row's first <td> cell to the text of its second.
sic_name = dict()
for tr in trs:
    tds = tr.find_all('td')
    # Rows with fewer than two <td> cells cannot supply a key/value pair.
    # Checking the length replaces a bare `except:` that would also have
    # hidden unrelated errors.
    if len(tds) < 2:
        continue
    k = tds[0].get_text().strip()
    v = tds[1].get_text().strip()
    sic_name[k] = v

sic_name


In [None]:
#Part 3 API & JSON
#1. Introduction to APIs and JSONs
import json
import os
from pprint import pprint 

# Load JSON from a local file: json_data
with open("a_movie.json") as json_file:
    json_data = json.load(json_file)

print(type(json_data))

for k, v in json_data.items():
    print(k + ': ', v)
    
#2. APIs and interacting with WWW
# Import requests package
import requests
from pprint import pprint

# The API key is a credential: read it from the environment instead of
# embedding it in the notebook, where it would be shared with every copy.
api_key = os.environ.get('OMDB_API_KEY')
if not api_key:
    raise RuntimeError('Set the OMDB_API_KEY environment variable before running this cell')

# Assign URL to variable: url (https, so the key is not sent in clear text)
url = 'https://www.omdbapi.com/'

# Package the request, send the request and catch the response: r
# requests builds and encodes the query string from the params dict.
r = requests.get(url, params={'apikey': api_key, 't': 'the social network'})
print(r.text)

#2.2 Convert requests response to JSON
# Decode the JSON data into a dictionary: json_data
json_data = r.json()

print(type(json_data))

pprint(json_data)


Chapter6 Pandas + EDA

In [None]:
#Part1. Data frame basics
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline
filepath = "1st_class.csv"
df = pd.read_csv(filepath)
# Inspection expressions: in a single cell only the last expression is displayed
df
df.shape
df.columns
df.info()
df.index
df.isnull().sum() # number of missing values per column
df.dropna(axis=1, how="all", inplace = True) # drop columns in which every value is missing
df.fillna("", inplace = True) # replace the remaining missing values with empty strings
df.rename(columns={'pup':'pop'},inplace=True) # rename the column 'pup' to 'pop'
df.iloc[:5,:] # first five rows, all columns
df.tail()
# Read the same file again, this time using the 'country' column as the index
filepath = "1st_class.csv"
df_country = pd.read_csv(filepath, index_col='country')
df_country.rename(columns={'pup':'pop'},inplace=True)
df_country.index
df_canada = df_country.loc['Canada',:] # rows whose index label is 'Canada'
df_canada.head(5)
df_canada['continent'].str.upper()
df_canada[df_canada['continent'].str.contains('A')] # rows whose continent contains an upper-case 'A'
del df_canada['continent'] # remove the continent column from df_canada
df_canada.head(5)
# Summary statistics
df_canada.describe()
df_canada['pop'].count()
df_canada['lifeExp'].mean()
df_canada['gdpPercap'].std()
df_canada['gdpPercap'].quantile(0.75) - df_canada['gdpPercap'].quantile(0.25) # interquartile range
# Line plot of life expectancy over time for Canada
plt.plot(df_canada.year, df_canada.lifeExp, label="Canada")
plt.xlabel("Year")
plt.ylabel("Life Expectancy")
plt.title("Life Expectancy in Canada")
plt.legend(loc="lower right")
plt.show()
# Same plot with the United States added (taken from df, where country is a regular column)
df_us = df[df['country']=='United States']
plt.plot(df_canada.year, df_canada.lifeExp, label='Canada')
plt.plot(df_us.year, df_us.lifeExp, label='US')
plt.xlabel("Year")
plt.ylabel("Life Expectancy")
plt.legend(loc="upper left")
plt.title("Life Expectancy in Canada and US")
plt.show()
# Correlation matrix of the df_canada columns, then the same matrix drawn as a heat map
df_canada.corr()
plt.show()
plt.figure(figsize=(20,10))
plt.matshow(df_canada.corr(), cmap=plt.get_cmap('summer'), fignum=1) # fignum=1 draws into the figure created above
plt.xticks(range(df_canada.shape[1]), df_canada.columns, fontsize=10, rotation=0) # label ticks with the column names
plt.yticks(range(df_canada.shape[1]), df_canada.columns, fontsize=10, rotation=0)
plt.colorbar(orientation="vertical")
plt.show()

# NOTE(review): yelp_final_filter is not defined anywhere in this cell; it
# presumably comes from another notebook -- confirm before running.
# Correlation between the 'stars' and 'review_count' columns, drawn as a heat map.
star_review = yelp_final_filter[['stars','review_count']]
star_review.corr()
plt.figure(figsize=(10,5))
plt.matshow(star_review.corr(), cmap='gray', fignum=1) # fignum=1 draws into the figure created above
plt.colorbar(orientation="vertical")
plt.xticks(range(star_review.shape[1]), star_review.columns, fontsize=10, rotation=0) # label ticks with the column names
plt.yticks(range(star_review.shape[1]), star_review.columns, fontsize=10, rotation=0)
plt.title("Correlation matrix between stars and review count")
plt.show()

In [None]:
#Part2. Concatenation and merge
# NOTE(review): df_1, df_2 and continent are not defined in this cell; they
# are assumed to be DataFrames sharing a 'country' column -- confirm.

# axis=0 (the default) concatenates along the rows.
# ignore_index=False keeps each frame's original index labels.
df = pd.concat([df_1, df_2], ignore_index=False)
df.tail()

# axis=1 concatenates along the columns.
# ignore_index=True discards the labels along the concatenation axis and
# renumbers them 0..n-1; with axis=1 that means the COLUMN names are lost.
# The result therefore gets its own name, so that `df` keeps the 'country'
# column the merges below rely on.
df_wide = pd.concat([df_1, df_2], ignore_index=True, axis = 1)
df_wide.head(2)

# how='left' keeps every row of df; how='right' keeps every row of continent
df_continent_left = pd.merge(df, continent, on=['country'], how='left')
df_continent_left
df_continent_right = pd.merge(df, continent, on=['country'], how='right')
df_continent_right
# indicator=True adds a '_merge' column recording where each row came from
df_continent = pd.merge(df, continent, on=['country'], how='left', indicator=True)
df_continent.head(2)
df_continent['_merge'].unique()

#Subset
df_continent['country']=='Canada' # boolean mask
df_canada = df_continent[df_continent['country']=='Canada']
# .copy() makes df_canada an independent frame, so adding a column below
# does not trigger SettingWithCopyWarning.
df_canada = df_continent[df_continent['country'].isin(['Canada'])].copy()
df_canada['no_meaning']=1

#Groupby
df_continent.groupby('country').count()
# numeric_only=True restricts the aggregation to numeric columns; the merged
# frame also holds text and the categorical '_merge' column, which cannot be
# summed or averaged.
df_continent.groupby('country').sum(numeric_only=True)
df_grouped=df_continent.groupby(["country","year"]).mean(numeric_only=True)
df_continent.groupby(['country','year']).mean(numeric_only=True).index # a (country, year) MultiIndex
df_grouped.reset_index() # turn the group keys back into regular columns

Chapter5 loop

In [None]:
#Part 1 Loops
#1. for loop patterns
#1.1 Iteration loop pattern: visit every item and act on the ones that match
words = ['stop', 'desktop', 'post', 'top']
for word in words:
    if 'top' in word:
        print(word)
#1.2 Counter loop pattern: enumerate() yields (index, item) pairs
s = 'hello'
for i, c in enumerate(s):
    print(i, c)
#1.3 Accumulator loop pattern: keep a running total across iterations
import time

lst = [3, 2, 7, 1, 9]

res = 0
for num in lst:
    print('{} + {} = {}'.format(res, num, res+num))
    res = res + num
    time.sleep(1) # pause one second so each step can be followed
print(res)

#2 continue and break statements
table = [
    [2, 3, 0, 6],
    [0, 3, 4, 5],
    [4, 5, 6, 0]]
# Print every item, one row of the table per output line
for row in table:
    for item in row:
        print(item, end=' ')
    print()
    
# break: stop printing a row at its first 0
for row in table:
    for num in row:
        if num == 0:
            break   # leaves the inner loop only; the outer loop moves on to the next row
        print(num, end=' ')
    print()

# continue: skip the 0s but keep going through the rest of the row
for row in table:
    for num in row:
        if num == 0:
            continue # jumps to the next iteration of the inner loop
        print(num, end=' ')
    print()
    
# While loop:
#4.1 basic while loop pattern
import time

i = 7
while i <= 37:
    i += 7 # incremented before printing, so the first value printed is 14
    print('in while loop, i=', i)
    time.sleep(1)
print(i) # the first multiple of 7 greater than 37

#4.2 Sequence loop pattern
def fibo(num):
    """Return the first number of the sequence 1, 1, 2, 3, 5, ... that is not below num."""
    previous, current = 1, 1
    while True:
        # Stop as soon as the current term is no longer smaller than num.
        if not current < num:
            return current
        previous, current = current, previous + current
#4.3 Infinite loop pattern
# NOTE: this loop has no exit condition. It keeps prompting until the kernel
# is interrupted, so nothing after it in the cell runs on its own.
while True:
    name = input('What is your name?')
    print('Hello {}'.format(name))
    
#4.4 Loop-and-a-half pattern
def cities():
    """Prompt for city names until an empty line is entered; return the names as a list."""
    collected = []
    city = input("Enter city name: ")
    # An empty answer is the sentinel that ends the input.
    while city != '':
        collected.append(city)
        city = input("Enter city name: ")
    return collected

In [None]:
#Part 2 Function
# Define shout_all with parameters word1 and word2
def shout_all(word1, word2):
    """Return a 2-tuple holding word1 and word2, each with '!!!' appended."""
    suffix = '!!!'
    return (word1 + suffix, word2 + suffix)
# Unpack the returned tuple into two variables
yell1, yell2 = shout_all('congratulations', 'you')

df = pd.read_csv('tweets.csv')
df.iloc[0]['lang'] # the 'lang' value of the first row
# Initialize an empty dictionary: langs_count
langs_count = {}

# Iterate over the rows of the DataFrame and count each value of 'lang'
for index, tweet in df.iterrows():
    lang = tweet['lang']
    #print(lang)
    
    # If the language is in langs_count, add 1 
    if lang in langs_count.keys():
        langs_count[lang] += 1
    # Else add the language to langs_count, set the value to 1
    else:
        langs_count[lang] = 1

# Print the populated dictionary
print(langs_count)
# Shorter counting idiom:  counts[key] = counts.get(key, 0) + 1
# Sort (key, count) pairs by count:  sorted(counts.items(), key=lambda x: x[1])

from pprint import pprint
import pandas as pd

# Define count_entries()
def count_entries(filename, col_name):
    """Return a dictionary with counts of occurrences as value for each key.

    filename -- path (or file-like object) of the CSV file to read; its
                first column is used as the index
    col_name -- name of the column whose values are counted

    Returns a dict mapping each distinct value of the column to the number
    of rows holding it, with keys in order of first appearance.
    Raises KeyError if col_name is not a column of the file.
    """
    # Local import keeps the function self-contained in a notebook cell.
    from collections import Counter

    # Import the data as a DataFrame: df
    df = pd.read_csv(filename, index_col=0)

    # Count the column's values directly instead of building a Series for
    # every row with iterrows() just to read a single field from it.
    return dict(Counter(df[col_name]))

from pprint import pprint
import pandas as pd


def count_entries(filename, col_name):
    """Read the CSV at filename and count how often each value of col_name occurs.

    The first CSV column becomes the index. Returns a dict mapping each
    value to its number of rows.
    """
    frame = pd.read_csv(filename, index_col=0)
    tally = {}
    for _, record in frame.iterrows():
        value = record[col_name]
        # get() supplies 0 the first time a value is seen
        tally[value] = tally.get(value, 0) + 1
    return tally

dict_res = count_entries('tweets.csv', 'source')
# Keys only, ordered by their count, largest first
result = sorted(dict_res, key=dict_res.get, reverse = True)
# (key, count) pairs, ordered by count, largest first
result2 = sorted(dict_res.items(), key= lambda x:x[1], reverse = True)

# The three most frequent (key, count) pairs
n_items = result2[:3]


print (n_items)

def is_prime(n):
    """Return True if the integer n is a prime number, otherwise False.

    Numbers below 2 (0, 1 and all negative numbers) are not prime.
    """
    if n < 2:
        return False
    # A composite n has a divisor no larger than its square root, so trial
    # division can stop once divisor * divisor exceeds n.
    divisor = 2
    while divisor * divisor <= n:
        if n % divisor == 0:
            return False
        divisor += 1
    return True

print (is_prime(31))   

def find_largest_prime(n):
    """Return the largest prime smaller than n, or None when no candidate passes is_prime."""
    for offset in range(1, n):
        # Candidates run downwards: n-1, n-2, ..., 1
        candidate = n - offset
        if is_prime(candidate) == True:
            return candidate
    return None

print (find_largest_prime(100))


# Method 1
def acronym(string):
    """Return the upper-cased first letters of the whitespace-separated words in string."""
    initials = [word[0] for word in string.split()]
    return ''.join(initials).upper()

print (acronym("GNU's not UNIX"))


# Method 2
def acr (string):
    """Build an acronym: the upper-cased first letter of every word in string."""
    letters = []
    for word in string.split():
        letters.append(word[0].upper())
    return "".join(letters)

print (acr("GNU's not UNIX"))

# method 1
def arithmetic (lst):
    """Return True if consecutive items of lst all differ by the same amount.

    Sequences with two or fewer items always count as arithmetic.
    """
    if len(lst) <= 2:
        return True
    # The first difference is the reference every later difference must match.
    gap = lst[1] - lst[0]
    return all(lst[i] - lst[i - 1] == gap for i in range(1, len(lst)))

print (arithmetic([3, 6, 9, 11, 14]))


# method 2
def arit (lst):
    """Return True if lst is an arithmetic sequence (two or fewer items always qualify)."""
    if len(lst) > 2:
        # All neighbouring differences collapse to one value exactly when the step is constant.
        steps = {lst[i + 1] - lst[i] for i in range(len(lst) - 1)}
        return len(steps) == 1
    return True

print (arit ([3, 6, 9, 11, 14]))
    


Chapter4 Logic Flow

In [None]:
#Compare arrays
# Create arrays
import numpy as np

my_house = np.array([18.0, 20.0, 10.75, 9.50])
your_house = np.array([14.0, 24.0, 14.25, 9.0])
# my_house greater than or equal to 18 (element-wise comparison)
print(my_house >= 18) # [ True  True False False]
# my_house less than your_house
print(my_house < your_house) #[False  True  True False]
# Note: `not` has a higher priority than `and` and `or`, so it is evaluated first.
# my_house greater than 18.5 or smaller than 10
print(my_house > 18.5)
print(my_house < 10)
print('or')
print(np.logical_or(my_house > 18.5, my_house < 10)) # element-wise OR of the two boolean arrays
import numpy as np

# NOTE(review): `cars` is not defined in this cell; it is assumed to be a
# DataFrame with 'cars_per_cap' and 'drives_right' columns -- confirm.
# Create medium: observations with cars_per_cap between 100 and 500
medium = cars[np.logical_and(cars['cars_per_cap'] > 100, 
                             cars['cars_per_cap'] < 500)]

# Mean cars_per_cap of the rows where drives_right is True / False
right = np.mean(cars[cars["drives_right"]== True]["cars_per_cap"])
left = np.mean(cars[cars["drives_right"]== False]["cars_per_cap"])
print(right,left)

import time

# areas list
areas = [11.25, 18.0, 20.0, 10.75, 9.50]

# enumerate() yields (index, value) pairs, with the index starting at 0
for index, a in enumerate(areas) :
    print("room {}: {}".format(index, a))
    time.sleep(1) # pause one second between lines
    
# house list of lists: each inner list is [room name, area]
house = [["hallway", 11.25], 
         ["kitchen", 18.0], 
         ["living room", 20.0], 
         ["bedroom", 10.75], 
         ["bathroom", 9.50]]

for lst in house:
    print ("the "+ str(lst[0]) + " is " + str(lst[1]) + " sqm")

In [None]:
#Loop over different structures:
# NOTE(review): `europe` and `cars` are not defined in this cell -- confirm
# where they are created before running it.
#Dict: items() yields (key, value) pairs
for k, v in europe.items():
    print('the capital of {} is {}'.format(k, v))
#Dataframe
for row in cars:
    print(row) # iterating a DataFrame yields its column names: cars_per_cap,country,drives_right
# Iterate over rows of cars: iterrows() yields (row label, row as Series)
for label, row in cars.iterrows():
    print(label)
# Adapt for loop to print the row label together with one field of the row
for lab, row in cars.iterrows() :
    print('{}: {}'.format(lab, row['cars_per_cap']))
    
    # Code for loop that adds COUNTRY column

#Add new column, one row at a time
for lab, row in cars.iterrows():
    cars.loc[lab, 'COUNTRY'] = row['country'].upper()

# Print cars
cars

# Use .apply(str.upper): builds the whole column in one call instead of a row loop
cars["COUNTRY2"] = cars["country"].apply(str.upper)
cars["COUNTRY_LENGTH"] = cars["country"].apply(len)

#Filter a DataFrame with multiple conditions
import numpy as np
cars[np.logical_and(cars["cars_per_cap"]>100,cars["drives_right"]==True)] # boolean-mask version

# The same filter written as an explicit loop over the rows
for lab, row in cars.iterrows():
    if np.logical_and(cars.loc[lab,"cars_per_cap"]>100, cars.loc[lab,"drives_right"] == True):
        print (cars.loc[[lab]]) # double brackets keep the row as a DataFrame
        
#loop over numpy
import pandas as pd
import numpy as np

df = pd.read_csv('mlb_sample.csv')
np_height = np.array(df['Height(inches)'].values)
np_baseball = np.array(df['Weight(pounds)'].values) # holds the weight column despite its name
# For loop over np_height: np.nditer visits every element of the array
for h in np.nditer(np_height):
    print("{} inches".format(h))
# For loop over np_baseball
for b in np.nditer(np_baseball):
    print(b)

Chapter3 Matplotlib & Dictionary

In [None]:
#Part1 
# NOTE(review): `plt` is used below but matplotlib.pyplot is not imported in
# this cell, and df2 / df4 are not defined here -- confirm where they come from.
#Lineplot
import pandas as pd
df1 = pd.read_csv('world_year_pop.csv')
plt.plot(df1.years, df1.populations)
plt.show()
#Scatterplot
plt.scatter(df2.gdp_cap, df2.life_exp)
# Put the x-axis on a logarithmic scale
plt.xscale('log')
plt.show()
#Histogram
import pandas as pd
df3 = pd.read_csv('country_life_gdp.csv')
# Create histogram of life_exp data with 5 bins
plt.hist(df3.life_exp,bins=5)
# Display histogram
plt.show()

#Compare two distributions: alpha=0.4 makes the bars translucent so the overlap stays visible
plt.hist(df4.life_1950, bins=15, alpha=0.4)
plt.hist(df4.life_2007, bins=15, alpha=0.4)
plt.show()

In [None]:
#Part2 Dictionary
# NOTE(review): `europe` is not defined in this cell -- confirm where it is created.
print(europe.keys())
print(europe.values())
# Remove australia
del(europe['australia'])

#Pandas
cars = pd.read_csv('cars.csv', index_col=0) # the first CSV column becomes the row labels
# Single brackets return a Series, double brackets return a DataFrame.
# Print out country column as Pandas Series
cars['country']
# Print out country column as Pandas DataFrame
cars[['country']]
# Print out DataFrame with country and drives_right columns
cars[['country', 'drives_right']]
# Print out first 3 observations (the slice end is exclusive, so [:3] gives rows 0, 1 and 2)
cars[:3]
# Print out observation for Japan as Pandas series
cars.loc['JAP']
# Print out observation for Japan as dataframe
cars.loc[['JAP']]
# Print out observations for Australia and Egypt
cars.loc[['AUS', 'EG']]
# Print out drives_right value of Morocco
cars.loc['MOR', 'drives_right']
# Print out drives_right value of Morocco as dataframe
cars.loc[['MOR'], ['drives_right']]
# Print sub-DataFrame
cars.loc[['RU', 'MOR'], ['country', 'drives_right']]
# Print out drives_right column as Series
cars.loc[:, 'drives_right']
# Print out drives_right column as DataFrame
cars.loc[:, ['drives_right']]
# Print out cars_per_cap and drives_right as DataFrame
cars.loc[:, ['cars_per_cap', 'drives_right']]
# Mean cars_per_cap over all rows
cars.loc[:, 'cars_per_cap'].values.mean()
# Mean cars_per_cap of the countries whose name starts with an upper-case vowel
# (str.startswith accepts a tuple of prefixes)
lst = ["A","E","I","O","U"]
cars[cars["country"].str.startswith(tuple(lst))]["cars_per_cap"].values.mean()
# Mean cars_per_cap of four rows selected by label
cars.loc[["US","AUS","IN","EG"], 'cars_per_cap'].values.mean()