In [1]:
import numpy as np
import pandas as pd
from pandas import Series, DataFrame

In [2]:
filename = '../data/alice-in-wonderland.txt'

s = Series(open(filename).read().split())
s.head()

0         ﻿The
1      Project
2    Gutenberg
3        EBook
4           of
dtype: object

# Beyond 1

What is the mean of all integers in Alice?

정수 형식의 목록 생성

- 음의 정수는 포함되어 있지 않지만 음의 정수도 고려해야 함.
- 1,000 처럼 쉼표를 사용하는 숫자도 고려함.
    - isdigit() 메서드 대신 match() 메서드와 정규표현식 활용. 필요한 정규표현식은 AI가 잘 만들어줌.

정수 목록

In [3]:
import string
import re

(
    s.str.strip()  # 양쪽 공백만 제거
     .str.strip(string.punctuation.replace('-', '').replace('+', ''))  # -, +는 남겨두고 나머지 문장부호만 제거하고 싶다면
      # 정규식으로: 선택적 부호 + 콤마 포함 숫자 패턴만 남기기
     .loc[lambda s_: s_.str.match(r'^[+-]?\d{1,3}([,]?\d{3})*$')]
)

68          12
69        2006
71       19033
122       1916
10134        1
11164       20
11225       60
11267        4
11300       30
11389       90
11474        3
11706       90
12056        2
12145     2001
12187        3
12189        4
12198        3
12287     4557
12293    99712
12310      809
12312     1500
12318    84116
12319      801
12359        4
12421        1
12423    5,000
12451       50
12616        5
dtype: object

정수들의 평균값

In [4]:
import string
import re

(
    s
    .str
    .strip()  # 양쪽 공백만 제거
    .str.strip(string.punctuation.replace('-', '').replace('+', ''))  # -, +는 남겨두고 나머지 문장부호만 제거하고 싶다면
    # 정규식으로: 선택적 부호 + 콤마 포함 숫자 패턴만 남기기
    .loc[lambda s_: s_.str.match(r'^[+-]?\d{1,3}([,]?\d{3})*$')]
    # 콤마 제거
    .str.replace(',', '', regex=False)
    # 정수로 변환 후 평균
    .astype(int)
    .mean()
)

np.float64(7922.607142857143)

# Beyond 2

What words in Alice don't appear in the dictionary? Which are the five most common such words?

In [5]:
words = {one_word.strip().lower() for one_word in open('../data/words.txt', encoding='utf-8')}

In [6]:
(
    s
    .str.strip(string.punctuation)      # Strip punctuation
    .loc[lambda s_: s_.str.isalpha()]   # Keep only those with letters
    .loc[lambda s_: ~s_.isin(words)]    # Now keep those *not* in the dictionary, and find the most common ones
    .value_counts()
)

Alice            166
I                127
Project           83
The               63
She               36
                ... 
accepting          1
donors             1
offers             1
International      1
statements         1
Name: count, Length: 795, dtype: int64

In [7]:
(
    s
    .str.lower()                       # Convert to lowercase
    .str.strip(string.punctuation)      # Strip punctuation
    .loc[lambda s_: s_.str.isalpha()]   # Keep only those with letters
    .loc[lambda s_: ~s_.isin(words)]    # Now keep those *not* in the dictionary, and find the most common ones
    .value_counts()
)

gutenberg    30
terms        21
began        19
looked       18
heard        17
             ..
accepting     1
paperwork     1
takes         1
storyland     1
includes      1
Name: count, Length: 408, dtype: int64

# Beyond 3

What is the mean number of words per paragraph?

In [8]:
# Read the file into a series by paragraph
s = Series(open(filename, encoding='utf-8').read().split('\n\n'))

# Just use describe to get min, max, and everything else
(
    s
    .str
    .split()
    .str
    .len()
    .describe() 
)

count    393.000000
mean      32.475827
std       32.428415
min        0.000000
25%        7.000000
50%       22.000000
75%       48.000000
max      169.000000
dtype: float64

이상한 나라의 앨리스의 본문만을 대상으로 할 경우:

In [9]:
s = Series(open(filename).read().splitlines())

start = s[s.str.contains("START OF")].index[0] + 1
end   = s[s.str.contains("END OF")].index[0]

s_main = "\n".join(s.loc[start:end])

paragraphs = Series(s_main.split("\n\n"))

(
    paragraphs
    .str.split()
    .str.len()
    .describe()
)

count    332.000000
mean      29.415663
std       29.052127
min        0.000000
25%        6.000000
50%       18.500000
75%       44.000000
max      123.000000
dtype: float64