In [1]:
import pandas as pd

# 1. Writing your own functions

## 1.1 Loading a dataset and developing functionality to extract simple insights from the data

In [2]:
# Load the tweets dataset from the local datasets folder.
tweets_df = pd.read_csv('./datasets/tweets.csv')

# Dictionary mapping each language code to the number of tweets seen with it.
langs_count = {}

col = tweets_df['lang']

# Start each unseen language at 0, then add one for the current tweet.
for entry in col:
    langs_count[entry] = langs_count.get(entry, 0) + 1

print(langs_count)
    

{'und': 2, 'en': 97, 'et': 1}


## 1.2 Defining a function

In [3]:
def count_entries(df, col_name):
    """Count how often each distinct value occurs in one column.

    Args:
        df: DataFrame-like object that can be indexed by column label.
        col_name: Label of the column whose entries are tallied.

    Returns:
        dict mapping each distinct entry to its number of occurrences,
        with keys in order of first appearance.
    """
    tally = {}

    # Unseen values start from 0 so the first sighting becomes 1.
    for value in df[col_name]:
        tally[value] = tally.get(value, 0) + 1

    return tally

In [4]:
# Tally the tweet languages with the reusable function instead of inline code.
result = count_entries(df=tweets_df, col_name='lang')

print(result)

{'und': 2, 'en': 97, 'et': 1}


# 2. Default arguments, variable-length arguments and scope

## 2.1 Generalizing the Twitter language analysis: default argument

In [5]:
def count_entries(df, col_name='lang'):
    """Count how often each distinct value occurs in one column.

    Args:
        df: DataFrame-like object that can be indexed by column label.
        col_name: Label of the column to tally; defaults to 'lang'.

    Returns:
        dict mapping each distinct entry to its number of occurrences,
        with keys in order of first appearance.
    """
    cols_count = {}

    for entry in df[col_name]:
        # Register a first-time entry at 0, then count this occurrence.
        cols_count.setdefault(entry, 0)
        cols_count[entry] += 1

    return cols_count

In [6]:
# Run the same counter on two different columns.
result1 = count_entries(tweets_df, 'lang')
result2 = count_entries(tweets_df, 'source')

# One result per output line.
print(result1, result2, sep='\n')

{'und': 2, 'en': 97, 'et': 1}
{'<a href="http://ifttt.com" rel="nofollow">IFTTT</a>': 1, '<a href="http://www.facebook.com/twitter" rel="nofollow">Facebook</a>': 1, '<a href="http://twitter.com/download/iphone" rel="nofollow">Twitter for iPhone</a>': 33, '<a href="http://www.myplume.com/" rel="nofollow">Plume\xa0for\xa0Android</a>': 1, '<a href="http://twitter.com/download/android" rel="nofollow">Twitter for Android</a>': 26, '<a href="http://linkis.com" rel="nofollow">Linkis.com</a>': 2, '<a href="http://twitter.com" rel="nofollow">Twitter Web Client</a>': 24, '<a href="http://twitter.com/#!/download/ipad" rel="nofollow">Twitter for iPad</a>': 6, '<a href="http://www.twitter.com" rel="nofollow">Twitter for BlackBerry</a>': 2, '<a href="http://rutracker.org/forum/viewforum.php?f=93" rel="nofollow">newzlasz</a>': 2, '<a href="http://www.google.com/" rel="nofollow">Google</a>': 2}


## 2.2 Passing a flexible argument (*args)

In [7]:
def count_entries(df, *args):
    """Count entries across any number of columns into a single dict.

    Counts from every requested column accumulate in the same dictionary,
    so a value that appears in several columns is counted across all of
    them. With no column names given, an empty dict is returned.

    Args:
        df: DataFrame-like object that can be indexed by column label.
        *args: Labels of the columns whose entries are tallied.

    Returns:
        dict mapping each distinct entry to its total number of
        occurrences over the requested columns.
    """
    cols_count = {}

    for col_name in args:
        for entry in df[col_name]:
            cols_count[entry] = cols_count.get(entry, 0) + 1

    return cols_count

In [8]:
# Each call passes a single column name, which *args collects into a tuple.
result1 = count_entries(tweets_df, 'lang')
result2 = count_entries(tweets_df, 'source')

# One result per output line.
print(result1, result2, sep='\n')

{'und': 2, 'en': 97, 'et': 1}
{'<a href="http://ifttt.com" rel="nofollow">IFTTT</a>': 1, '<a href="http://www.facebook.com/twitter" rel="nofollow">Facebook</a>': 1, '<a href="http://twitter.com/download/iphone" rel="nofollow">Twitter for iPhone</a>': 33, '<a href="http://www.myplume.com/" rel="nofollow">Plume\xa0for\xa0Android</a>': 1, '<a href="http://twitter.com/download/android" rel="nofollow">Twitter for Android</a>': 26, '<a href="http://linkis.com" rel="nofollow">Linkis.com</a>': 2, '<a href="http://twitter.com" rel="nofollow">Twitter Web Client</a>': 24, '<a href="http://twitter.com/#!/download/ipad" rel="nofollow">Twitter for iPad</a>': 6, '<a href="http://www.twitter.com" rel="nofollow">Twitter for BlackBerry</a>': 2, '<a href="http://rutracker.org/forum/viewforum.php?f=93" rel="nofollow">newzlasz</a>': 2, '<a href="http://www.google.com/" rel="nofollow">Google</a>': 2}


# 3. Lambda functions and error-handling

## 3.1 Writing a lambda function

In [9]:
# Select retweets from the Twitter dataframe: result
result = filter(lambda x: x[0:2]=='RT', tweets_df['text'])

res_list = list(result)

# Print all retweets in res_list
for tweet in res_list:
    print(tweet)

RT @bpolitics: .@krollbondrating's Christopher Whalen says Clinton is the weakest Dem candidate in 50 years https://t.co/pLk7rvoRSn https:/…
RT @HeidiAlpine: @dmartosko Cruz video found.....racing from the scene.... #cruzsexscandal https://t.co/zuAPZfQDk3
RT @AlanLohner: The anti-American D.C. elites despise Trump for his America-first foreign policy. Trump threatens their gravy train. https:…
RT @BIackPplTweets: Young Donald trump meets his neighbor  https://t.co/RFlu17Z1eE
RT @trumpresearch: @WaitingInBagdad @thehill Trump supporters have selective amnisia.
RT @HouseCracka: 29,000+ PEOPLE WATCHING TRUMP LIVE ON ONE STREAM!!!

https://t.co/7QCFz9ehNe
RT @urfavandtrump: RT for Brendon Urie
Fav for Donald Trump https://t.co/PZ5vS94lOg
RT @trapgrampa: This is how I see #Trump every time he speaks. https://t.co/fYSiHNS0nT
RT @trumpresearch: @WaitingInBagdad @thehill Trump supporters have selective amnisia.
RT @Pjw20161951: NO KIDDING: #SleazyDonald just attacked Scott Walker for NOT RAISI

## 3.2 Adding a try-except block

In [10]:
def count_entries(df, col_name='lang'):
    """Count how often each distinct value occurs in one column.

    If the column does not exist, a message is printed and None is
    returned instead of raising.

    Args:
        df: DataFrame-like object that can be indexed by column label.
        col_name: Label of the column to tally; defaults to 'lang'.

    Returns:
        dict mapping each distinct entry to its number of occurrences,
        or None when `col_name` is not a column of `df`.
    """
    # Guard only the column lookup, and only against the missing-column
    # error. A bare `except:` around the whole loop would report unrelated
    # failures (e.g. unhashable entries) as a missing column and hide them.
    try:
        col = df[col_name]
    except KeyError:
        print(f'The DataFrame does not have a {col_name} column.')
        return None

    cols_count = {}

    for entry in col:
        cols_count[entry] = cols_count.get(entry, 0) + 1

    return cols_count

In [11]:
# Existing column: the counts dictionary is returned as usual.
result1 = count_entries(tweets_df, col_name='lang')

print(result1)

{'und': 2, 'en': 97, 'et': 1}


In [12]:
# Nonexistent column: exercises the error-handling path of count_entries.
result2 = count_entries(tweets_df, col_name='lang1')

The DataFrame does not have a lang1 column.


## 3.3 Raising a ValueError

In [13]:
def count_entries(df, col_name='lang'):
    """Count how often each distinct value occurs in one column.

    Args:
        df: DataFrame whose `columns` attribute lists its column labels.
        col_name: Label of the column to tally; defaults to 'lang'.

    Returns:
        dict mapping each distinct entry to its number of occurrences,
        with keys in order of first appearance.

    Raises:
        ValueError: If `col_name` is not one of `df.columns`.
    """
    # Fail fast with a descriptive error before doing any counting.
    if col_name not in df.columns:
        raise ValueError('The DataFrame does not have a ' + col_name + ' column.')

    cols_count = {}

    for entry in df[col_name]:
        try:
            cols_count[entry] += 1
        except KeyError:
            # First time this entry is seen.
            cols_count[entry] = 1

    return cols_count

In [14]:
# Existing column: validation passes and the counts are returned.
result1 = count_entries(tweets_df, col_name='lang')

print(result1)

{'und': 2, 'en': 97, 'et': 1}


In [15]:
# Nonexistent column: count_entries raises ValueError. Catch and display it so
# the demonstration is visible without leaving an uncaught exception that
# would stop a "Restart & Run All" of the notebook.
try:
    result2 = count_entries(tweets_df, 'lang1')
except ValueError as err:
    print(f'ValueError: {err}')

ValueError: The DataFrame does not have a lang1 column.