<h1>Open connections to a file with the context manager:</h1>

Syntax:
<b>with open('datacamp.csv') as datacamp:</b>

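This opens the CSV file 'datacamp.csv' and binds the resulting file object to the name datacamp for the duration of the with block.

Here, the with statement runs its block under a context manager (the file object returned by open). Its purpose is to ensure that resources are efficiently allocated when opening a connection to a file: the file is closed automatically when the block ends, even if an error occurs inside it.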

Lazy evaluation (producing each value only when it is requested, as a generator does) is useful when you have to deal with very large datasets, because it lets you generate values efficiently by yielding only chunks of data at a time instead of the whole thing at once.

In [13]:
# Define read_large_file()
def read_large_file(file_object):
    """Lazily yield the lines of an open file, one per iteration.

    Each step calls ``file_object.readline()`` and yields whatever it
    returns, so only a single line is held at a time. Iteration stops as
    soon as ``readline()`` returns a falsy value (for example the empty
    string a text file gives at end of file).
    """

    # Priming read: fetch the first line before entering the loop
    line = file_object.readline()

    # Keep going for as long as the last read produced something
    while line:

        # Hand the current line to the caller
        yield line

        # Fetch the next line only once the caller asks for more
        line = file_object.readline()
        
# Stream tweets.csv through the lazy reader inside a with block
with open('tweets.csv') as file:

    # Wrap the open file in the line-by-line generator: gen_file
    gen_file = read_large_file(file)

    # Pull and print the first three lines, one next() call per line
    for line_index in range(3):
        print(next(gen_file))

contributors,coordinates,created_at,entities,extended_entities,favorite_count,favorited,filter_level,geo,id,id_str,in_reply_to_screen_name,in_reply_to_status_id,in_reply_to_status_id_str,in_reply_to_user_id,in_reply_to_user_id_str,is_quote_status,lang,place,possibly_sensitive,quoted_status,quoted_status_id,quoted_status_id_str,retweet_count,retweeted,retweeted_status,source,text,timestamp_ms,truncated,user

,,Tue Mar 29 23:40:17 +0000 2016,"{'hashtags': [], 'user_mentions': [{'screen_name': 'bpolitics', 'name': 'Bloomberg Politics', 'id': 564111558, 'id_str': '564111558', 'indices': [3, 13]}, {'screen_name': 'krollbondrating', 'name': 'Kroll Bond Ratings', 'id': 1963523857, 'id_str': '1963523857', 'indices': [16, 32]}], 'symbols': [], 'media': [{'sizes': {'large': {'w': 1024, 'h': 691, 'resize': 'fit'}, 'medium': {'w': 600, 'h': 405, 'resize': 'fit'}, 'small': {'w': 340, 'h': 229, 'resize': 'fit'}, 'thumb': {'w': 150, 'h': 150, 'resize': 'crop'}}, 'expanded_url': 'http://twitter.com/bp

In [28]:
# Import the pandas package
import pandas as pd

# Read tweets.csv in 10-row chunks; read_csv hands back an iterable
# reader object rather than a DataFrame: df_reader
df_reader = pd.read_csv('tweets.csv', chunksize=10)

# Pull and print the first two chunks, one next() call per chunk
for chunk_index in range(2):
    print(next(df_reader))

   contributors  coordinates                      created_at  \
0           NaN          NaN  Tue Mar 29 23:40:17 +0000 2016   
1           NaN          NaN  Tue Mar 29 23:40:17 +0000 2016   
2           NaN          NaN  Tue Mar 29 23:40:17 +0000 2016   
3           NaN          NaN  Tue Mar 29 23:40:17 +0000 2016   
4           NaN          NaN  Tue Mar 29 23:40:17 +0000 2016   
5           NaN          NaN  Tue Mar 29 23:40:17 +0000 2016   
6           NaN          NaN  Tue Mar 29 23:40:18 +0000 2016   
7           NaN          NaN  Tue Mar 29 23:40:17 +0000 2016   
8           NaN          NaN  Tue Mar 29 23:40:18 +0000 2016   
9           NaN          NaN  Tue Mar 29 23:40:18 +0000 2016   

                                            entities  \
0  {'hashtags': [], 'user_mentions': [{'screen_na...   
1  {'hashtags': [{'text': 'cruzsexscandal', 'indi...   
2  {'hashtags': [], 'user_mentions': [], 'symbols...   
3  {'hashtags': [], 'user_mentions': [], 'symbols...   
4  {'hashtags':

In [29]:
# Read tweets.csv lazily in 1000-row chunks: tweets_reader
tweets_reader = pd.read_csv('tweets.csv', chunksize=1000)

# Pull only the first chunk from the reader: df_tweets
df_tweets = next(tweets_reader)

# Preview the top rows of that chunk
print(df_tweets.head())

# Keep only the rows whose 'lang' column equals 'en': df_tweets_en
df_tweets_en = df_tweets.loc[df_tweets['lang'] == 'en']

# Pair each remaining tweet's text with its id: pops
pops = zip(df_tweets_en['text'], df_tweets_en['id'])

# Materialize the pairs into a list: pops_list
pops_list = [*pops]

# Show every (text, id) pair
print(pops_list)

   contributors  coordinates                      created_at  \
0           NaN          NaN  Tue Mar 29 23:40:17 +0000 2016   
1           NaN          NaN  Tue Mar 29 23:40:17 +0000 2016   
2           NaN          NaN  Tue Mar 29 23:40:17 +0000 2016   
3           NaN          NaN  Tue Mar 29 23:40:17 +0000 2016   
4           NaN          NaN  Tue Mar 29 23:40:17 +0000 2016   

                                            entities  \
0  {'hashtags': [], 'user_mentions': [{'screen_na...   
1  {'hashtags': [{'text': 'cruzsexscandal', 'indi...   
2  {'hashtags': [], 'user_mentions': [], 'symbols...   
3  {'hashtags': [], 'user_mentions': [], 'symbols...   
4  {'hashtags': [], 'user_mentions': [{'screen_na...   

                                   extended_entities  favorite_count  \
0  {'media': [{'sizes': {'large': {'w': 1024, 'h'...               0   
1  {'media': [{'sizes': {'large': {'w': 500, 'h':...               0   
2                                                NaN          