# 03_02: Loading Text Files

In [2]:
import math
import collections

import numpy as np
import pandas as pd
import matplotlib.pyplot as pp

%matplotlib inline

In [3]:
# iterating over an open file yields its lines, one by one

words = []
# the with-block closes the file as soon as the loop is done,
# instead of leaving the handle open until garbage collection
with open('words.txt', 'r') as words_file:
    for line in words_file:
        # each line is kept as-is, including its trailing newline
        words.append(line)

In [4]:
# how many lines were read (one list entry per line of the file)
len(words)

235886

In [5]:
# peek at the first ten entries; the lines were appended unmodified,
# so each one still carries its trailing newline
words[:10]

['A\n',
 'a\n',
 'aa\n',
 'aal\n',
 'aalii\n',
 'aam\n',
 'Aani\n',
 'aardvark\n',
 'aardwolf\n',
 'Aaron\n']

In [6]:
# strip() removes leading and trailing whitespace, which includes the newline
'Aaron\n'.strip()

'Aaron'

In [7]:
# string methods can be chained: drop the newline first, then lowercase the result
'Aaron\n'.strip().lower()

'aaron'

In [9]:
# rebuild the list, this time normalizing every line:
# strip() drops the trailing newline, lower() folds case
words = []
# the with-block guarantees the file is closed once the loop finishes
with open('words.txt', 'r') as words_file:
    for line in words_file:
        words.append(line.strip().lower())

In [10]:
# first ten entries again: newlines are gone and everything is lowercase,
# so a word that appeared in both cases (e.g. 'A' and 'a') now shows up twice
words[:10]

['a',
 'a',
 'aa',
 'aal',
 'aalii',
 'aam',
 'aani',
 'aardvark',
 'aardwolf',
 'aaron']

In [11]:
# collect the normalized words in a set, which keeps a single copy of each
words = set()
# the with-block guarantees the file is closed once the loop finishes
with open('words.txt', 'r') as words_file:
    for line in words_file:
        words.add(line.strip().lower())

In [12]:
# a set comprehension that collects stripped and lowercased lines...
# (run inside a with-block so the file is closed right after it is consumed)
with open('words.txt', 'r') as words_file:
    words = {line.strip().lower() for line in words_file}

In [13]:
# display the set: every word appears once, and sets have no guaranteed order
words

{'leucomelanous',
 'overinclination',
 'tremolando',
 'brickkiln',
 'unadventurous',
 'wisp',
 'phenomenalism',
 'griffinesque',
 'saccharomycetales',
 'juneberry',
 'chronological',
 'dishonorable',
 'incorrupt',
 'levyist',
 'linchpinned',
 'intragastric',
 'ensigncy',
 'arteriocapillary',
 'alumish',
 'localness',
 'camelopardus',
 'filament',
 'victless',
 'uncoachableness',
 'instealing',
 'metatarsal',
 'cryptomerous',
 'rightlessness',
 'pokable',
 'mulligrubs',
 'antimosquito',
 'permeably',
 'curvilinearly',
 'legatorial',
 'cottoid',
 'galeodidae',
 'legless',
 'pancreaticosplenic',
 'uninvoiced',
 'galvanized',
 'gymnura',
 'suppertime',
 'poorwill',
 'intranational',
 'preadvertency',
 'retemptation',
 'shorthorn',
 'doab',
 'wendell',
 'enjoin',
 'colligate',
 'phial',
 'peddlerism',
 'griffade',
 'preacherdom',
 'philatelically',
 'presupreme',
 'bobbiner',
 'renunciative',
 'neoanthropic',
 'kilogram',
 'procombination',
 'consulage',
 'polylaminated',
 'rescription',
 '

In [14]:
# ...turned into a sorted list
# (the set removes duplicates first; the with-block closes the file afterwards)
with open('words.txt', 'r') as words_file:
    words = sorted({line.strip().lower() for line in words_file})

In [15]:
# display the result: unique, lowercased words in sorted order
words

['a',
 'aa',
 'aal',
 'aalii',
 'aam',
 'aani',
 'aardvark',
 'aardwolf',
 'aaron',
 'aaronic',
 'aaronical',
 'aaronite',
 'aaronitic',
 'aaru',
 'ab',
 'aba',
 'ababdeh',
 'ababua',
 'abac',
 'abaca',
 'abacate',
 'abacay',
 'abacinate',
 'abacination',
 'abaciscus',
 'abacist',
 'aback',
 'abactinal',
 'abactinally',
 'abaction',
 'abactor',
 'abaculus',
 'abacus',
 'abadite',
 'abaff',
 'abaft',
 'abaisance',
 'abaiser',
 'abaissed',
 'abalienate',
 'abalienation',
 'abalone',
 'abama',
 'abampere',
 'abandon',
 'abandonable',
 'abandoned',
 'abandonedly',
 'abandonee',
 'abandoner',
 'abandonment',
 'abanic',
 'abantes',
 'abaptiston',
 'abarambo',
 'abaris',
 'abarthrosis',
 'abarticular',
 'abarticulation',
 'abas',
 'abase',
 'abased',
 'abasedly',
 'abasedness',
 'abasement',
 'abaser',
 'abasgi',
 'abash',
 'abashed',
 'abashedly',
 'abashedness',
 'abashless',
 'abashlessly',
 'abashment',
 'abasia',
 'abasic',
 'abask',
 'abassin',
 'abastardize',
 'abatable',
 'abate',
 'a

In [16]:
# return a list of all lines from an open file
# (the file is decoded as latin1, which is how accented words such as 'abaissé' are read;
#  readlines() keeps the trailing newline on every line)
with open('francais.txt', 'r', encoding='latin1') as french_file:
    french_lines = french_file.readlines()

# an expression inside the with-block would not be displayed by the notebook,
# so the list is shown by naming it as the last top-level expression
french_lines

['\n',
 'a\n',
 'ab\n',
 'abaissa\n',
 'abaissai\n',
 'abaissaient\n',
 'abaissais\n',
 'abaissait\n',
 'abaissant\n',
 'abaissas\n',
 'abaissasse\n',
 'abaissassent\n',
 'abaissasses\n',
 'abaissassiez\n',
 'abaissassions\n',
 'abaissâmes\n',
 'abaissât\n',
 'abaissâtes\n',
 'abaisse\n',
 'abaissement\n',
 'abaissements\n',
 'abaissent\n',
 'abaisser\n',
 'abaissera\n',
 'abaisserai\n',
 'abaisseraient\n',
 'abaisserais\n',
 'abaisserait\n',
 'abaisseras\n',
 'abaisserez\n',
 'abaisseriez\n',
 'abaisserions\n',
 'abaisserons\n',
 'abaisseront\n',
 'abaisses\n',
 'abaisseur\n',
 'abaisseurs\n',
 'abaissez\n',
 'abaissé\n',
 'abaissée\n',
 'abaissées\n',
 'abaissés\n',
 'abaissèrent\n',
 'abaissiez\n',
 'abaissions\n',
 'abaissons\n',
 'abandon\n',
 'abandonna\n',
 'abandonnai\n',
 'abandonnaient\n',
 'abandonnais\n',
 'abandonnait\n',
 'abandonnant\n',
 'abandonnas\n',
 'abandonnasse\n',
 'abandonnassent\n',
 'abandonnasses\n',
 'abandonnassiez\n',
 'abandonnassions\n',
 'abandonnâmes\