In [1]:
import sqlite3
import pandas as pd
import random
import re
random.seed(1)
from sklearn.feature_extraction.text import TfidfVectorizer
import numpy as np
import string
import nltk
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.sentiment.vader import SentimentIntensityAnalyzer
from bs4 import BeautifulSoup
from nltk.stem import WordNetLemmatizer
from textblob import TextBlob
from textblob import Word



In [2]:
db_path = '/Users/boyaliu/Documents/UCD/STA 208/final project/data/so-dump.db'
conn = sqlite3.connect(db_path)

In [3]:
query = "SELECT * FROM posts"

In [4]:
post_df = pd.read_sql_query(query, conn)

In [7]:
anpost_df = post_df[post_df.PostTypeId==2]

In [8]:
anpost_df.shape

(74331, 21)

In [25]:
# clean data
verb_exp = ['VB', 'VBZ', 'VBP', 'VBD','VBN','VBG']
stop_words = pd.read_pickle('/Users/boyaliu/Documents/UCD/STA 208/final project/stop_words.pickle.txt')

def clean_text(row):
    soup = BeautifulSoup(row, 'html.parser')
    #remove code
    for tag in soup.find_all('code'):
        tag.replaceWith(' ')
        
    raw = soup.get_text()
    #remove link
    raw_no_link = re.sub('http[s]?://(?:[a-zA-Z]|[0-9]|[$-_@.&+]|[!*\(\),]|(?:%[0-9a-fA-F][0-9a-fA-F]))+', '', raw)
    #remove email
    no_link_email = re.sub(r'[\w\.-]+@[\w\.-]+[\.][com|org|ch|uk]{2,3}', "", raw_no_link)
    #remove whitespace
    tab_text = '\t\n\r\x0b\x0c'
    no_link_email_space = "".join([ch for ch in no_link_email if ch not in set(tab_text)])
    #remove fomula
    reg = '(\$.+?\$)|((\\\\begin\{.+?\})(.+?)(\\\\end\{(.+?)\}))'
    raw = re.sub(reg, "", no_link_email_space, flags=re.IGNORECASE)   

    row = raw.lower()
    #remove numbers
    raw = re.sub('[0-9]+?', ' ', row) 
    # remove punctuation
    regex = re.compile('[%s]' % re.escape(string.punctuation))
    raw = regex.sub(' ', raw)
    #clean out the characters left out after the above step, like we’re, I’m, It’s, i.e.
    raw = re.sub('( s )|( re )|( m )|( i e )',' ',raw) 
    # lementize
    row_t = TextBlob(raw)
    raw = []
    for word, pos in row_t.tags:
        if pos in verb_exp:
            word = Word(word)
            word = word.lemmatize("v")
        else:
            word = Word(word)
            word = word.lemmatize()
        raw.append(word)
    clean = ' '.join(raw)       
    # remove stop words
    cleaned_text = " ".join([word for word in word_tokenize(clean) if word not in stop_words])      
    return(cleaned_text)

In [26]:
ans_clean = anpost_df['Body'].map(lambda i: clean_text(i))

In [24]:
ans_clean[0:5].tolist()

['projectr valuable significant widely accept open source alternative big box package mature support standard scientific community reason valuable nice tutorial',
 'incanter clojure base platform environment library statistical compute graphic',
 'response datasets run statistical analysis reference datasets',
 'machine learn basis pragmatic practical observation simulation reality statistic mindless check model assumption lead discard method year ago commercially work bankruptcy model implement credit bureau create plain linear regression model target outcome technically bad approach practically work',
 'jay valuable short list reason check ggplot nice graphic package nice tutorial']

# Stack Overflow Metrics

In [None]:
# count the space
space_count = ans_clean.map(lambda i: i.count(' '))

In [None]:
def case_count(row):
    # count lower_case percentage & upper_case percentage
    up = []
    row_nw = row.replace(" ", "")
    for c in row:
        if c.isupper():
            up.append(1)
            
    upper_per = sum(up)/len(row_nw)    
    lower_per = 1 - upper_per
    return((lower_per, upper_per))

In [None]:
# count the case 
c_c = anpost_df.Body.map(lambda i: case_count(i))
upper_per = c_c.map(lambda i: i[1])
lower_per = c_c.map(lambda i: i[0])

In [None]:
# count the url
def url_count(row):
    url = []
    soup = BeautifulSoup(row, "html.parser")
    raw = BeautifulSoup(row,"html.parser").get_text()
    for a in soup.find_all('a', href=True):
        url.append(a['href'])
    url_re = re.findall(r'http[s]?://(?:[a-zA-Z]|[0-9]|[$-_@.&+]|[!*\(\),]|(?:%[0-9a-fA-F][0-9a-fA-F]))+', raw)
    for i in url_re:
        url.append(i)
    return(len(set(url)))

In [None]:
# count the number of url
url_count = anpost_df.Body.map(lambda i: url_count(i))

In [None]:
# calculate the body length
body_len = anpost_df.Body.str.len()

In [None]:
# sentiment analysis
def sentiment_ana(i):
    i_tr = TextBlob(i)
    return(i_tr.sentiment[0], i_tr.sentiment[1])

In [None]:
sentiment_ans = ans_clean.map(lambda i: sentiment_ana(i))

# Readablity Matrix

## clean body

In [None]:
def exact_from_html(row, puc=0):
    soup = BeautifulSoup(row, "html.parser")

    #remove code
    #chunck_num = len(soup.find_all('code'))
    #i=1
    #while True:
      #  if i <= chunck_num:
            #soup.code.extract()
            #i = i + 1
        #else:
            #break
    for tag in soup.find_all('code'):
        tag.replaceWith(' ')
    ######
    raw = soup.get_text()
    #remove link
    raw_no_link = re.sub("http[s]?://(?:[a-zA-Z]|[0-9]|[$-_@.&+]|[!*\(\),]|(?:%[0-9a-fA-F][0-9a-fA-F]))+", '', raw)
    #remove email
    no_link_email = re.sub(r'[\w\.-]+@[\w\.-]+[\.][com|org|ch|uk]{2,3}', "", raw_no_link)
    #remove whitespace
    tab_text = '\t\n\r\x0b\x0c'
    no_link_email_space = "".join([ch for ch in no_link_email if ch not in set(tab_text)])
    #remove fomula
    reg = '(\$.+?\$)|((\\\\begin\{.+?\})(.+?)(\\\\end\{(.+?)\}))'
    raw = re.sub(reg, "", no_link_email_space, flags=re.IGNORECASE)
    #remove numbers
    raw = re.sub('[0-9]+?', ' ', raw)
    #remove punctuation
    #if puc == 1:
     #   raw = "".join([ch for ch in raw if ch not in set(string.punctuation)])
    # remove stop words
    #raw = " ".join([word for word in word_tokenize(raw.lower()) if word not in stopwords.words('english')])
    return(raw)

In [None]:
fun_clean = lambda i: exact_from_html(i)
Body_clean_puc = anpost_df['Body'].map(fun_clean)

In [None]:
Body_clean_puc.head()

In [None]:
import pickle
with open('Body_clean_puc.pickle', 'wb') as handle:
    pickle.dump(Body_clean_puc, handle,protocol=2)

## Exact code and formula

In [None]:
fomula_num = []
Code = []
def exact_formula(row):
    soup = BeautifulSoup(row, "html.parser")
    #remove code
    chunck_num = len(soup.find_all('code'))
    i=1
    while True:
        if chunck_num == 0:
            Code.append(0)
            break
        elif i <= chunck_num:
            code_num = 0
            code = soup.code.get_text()
            code = code.split('\n')
            codes = [x for x in code if x != '']
            code_num = code_num + len(codes)
            soup.code.extract()
            i = i + 1
        else:
            Code.append(code_num)
            break

    ######
    raw = soup.get_text()
    #remove link
    raw_no_link = re.sub("http[s]?://(?:[a-zA-Z]|[0-9]|[$-_@.&+]|[!*\(\),]|(?:%[0-9a-fA-F][0-9a-fA-F]))+", '', raw)
    #remove email
    no_link_email = re.sub(r'[\w\.-]+@[\w\.-]+[\.][com|org|ch|uk]{2,3}', "", raw_no_link)
    #remove whitespace
    tab_text = '\t\n\r\x0b\x0c'
    no_link_email_space = "".join([ch for ch in no_link_email if ch not in set(tab_text)])
    # remove fomula
    reg = '(\$.+?\$)|((\\\\begin\{.+?\})(.+?)(\\\\end\{(.+?)\}))'
    re_fomula = re.findall(reg, no_link_email_space, flags=re.IGNORECASE)
    if re_fomula != []:
        for item in re_fomula:
            no_link_email_space = no_link_email_space.replace(str(item), '')
        fomula_num.append(int(1))
    else:
        fomula_num.append(int(0))

In [None]:
fun_clean = lambda i: exact_formula(i)
get_result = anpost_df['Body'].map(fun_clean)

In [None]:
Code_df = pd.Series(data=Code)
Code_df.describe()

In [None]:
fomula_num_df = pd.Series(data=fomula_num)
fomula_num_df.value_counts()

In [None]:
Body_clean_puc_df = pd.DataFrame(Body_clean_puc)
Body_clean_puc_df['Code number'] = Code_df
Body_clean_puc_df['formula'] = fomula_num_df
Body_clean_puc_df.head()

In [None]:
qpost_df['Code number'] = Code_df
qpost_df['formula'] = fomula_num_df

## Word Counts

In [None]:
# words count
def word_num(text):
    text = "".join(x for x in text if x not in list(set(string.punctuation)))
    text = word_tokenize(text)
    return (len(text))

In [None]:
Words_num = Body_clean_puc_df['Body'].apply(word_num)
Words_num_df = pd.Series(data=Words_num)
Words_num_df.value_counts().head()

## Sentence Counts


In [None]:
# sentence count
def sent_num(text):
    return (len(sent_tokenize(text)))

In [None]:
Sents_num = Body_clean_puc_df['Body'].apply(sent_num)
Sents_num_df = pd.Series(data=Sents_num)
Sents_num_df.value_counts().head()

## Syllables Count

In [None]:

# syllables count The "Written Method" Rules:http://www.howmanysyllables.com/howtocountsyllables
def syllable_num(text):
    count = 0
    vowels = 'aeiouy'
    text = text.lower()
    text = "".join(x for x in text if x not in list(set(string.punctuation)))

    if text == None:
        return 0
    elif len(text) == 0:
        return 0
    else:
        if text[0] in vowels: 
            count += 1
        for index in range(1, len(text)):
            if text[index] in vowels and text[index-1] not in vowels:
                count += 1
        if text.endswith('e'): 
            count -= 1
        if text.endswith('le'):
            count += 1
        if text.endswith('les'):
            count += 1
        if count == 0:
            count += 1
    return count

## polysyllables count

In [None]:
# polysyllables count
def polysyllab_num(text):
    count = 0
    for word in text.split():
        wrds = syllable_num(word)
        if wrds >= 3:
            count += 1
    return count

## Characters Count

In [None]:
# characters count
def char_num(text, ignore_spaces=True):
    if ignore_spaces:
        text = text.replace(" ", "")
    text = "".join(x for x in text if x not in list(set(string.punctuation)))
    return len(text)

## Complex Words

In [None]:
easy_words = """a able aboard about above absent accept accident account ache aching acorn acre across act acts add address admire adventure afar	afraid after afternoon afterward afterwards again against age aged ago agree ah ahead aid aim air airfield airplane airport airship airy	alarm alike alive all alley alligator allow almost alone along aloud already also always am America American among amount an and	angel anger angry animal another answer ant any anybody anyhow anyone anything anyway anywhere apart apartment ape apiece appear apple April	apron are aren't arise arithmetic arm armful army arose around arrange arrive arrived arrow art artist as ash ashes aside ask	asleep at ate attack attend attention August aunt author auto automobile autumn avenue awake awaken away awful awfully awhile ax axe baa babe babies back background backward backwards bacon bad badge badly bag bake baker bakery baking ball balloon banana band bandage bang banjo bank banker bar barber bare barefoot barely bark barn barrel base baseball basement basket bat batch bath bathe bathing bathroom bathtub	battle battleship bay be beach bead beam bean bear beard beast beat beating beautiful beautify beauty became because become becoming bed bedbug bedroom bedspread bedtime bee beech beef beefsteak beehive been beer beet before beg began beggar begged begin beginning begun behave behind being	believe bell belong below belt bench bend beneath bent berries berry beside besides best bet better between bib bible bicycle bid big bigger bill billboard bin bind bird birth birthday biscuit bit bite biting bitter black blackberry blackbird blackboard blackness blacksmith blame blank blanket	blast blaze bleed bless blessing blew blind blindfold blinds block blood bloom blossom blot blow blue blueberry bluebird blush board boast boat bob bobwhite bodies body boil boiler bold bone bonnet boo book bookcase bookkeeper boom boot born borrow boss both bother bottle bottom	bought bounce bow bowl bow-wow box boxcar boxer boxes boy boyhood bracelet brain brake bran branch brass brave bread break breakfast breast breath breathe breeze brick bride bridge bright brightness bring broad broadcast broke broken brook broom brother brought brown brush bubble bucket buckle	bud buffalo bug buggy build building built bulb bull bullet bum bumblebee bump bun bunch bundle bunny burn burst bury bus bush bushel business busy but butcher butt butter buttercup butterfly buttermilk butterscotch button buttonhole buy buzz by bye cab cabbage cabin cabinet cackle cage cake calendar calf call caller calling came camel camp campfire can canal canary candle candlestick candy cane cannon cannot canoe can't canyon cap cape capital captain car card cardboard care careful careless carelessness carload carpenter carpet carriage carrot carry cart	carve case cash cashier castle cat catbird catch catcher caterpillar catfish catsup cattle caught cause cave ceiling cell cellar cent center cereal certain certainly chain chair chalk champion chance change chap charge charm chart chase chatter cheap cheat check checkers cheek cheer cheese cherry chest chew	chick chicken chief child childhood children chill chilly chimney chin china chip chipmunk chocolate choice choose chop chorus chose chosen christen Christmas church churn cigarette circle circus citizen city clang clap class classmate classroom claw clay clean cleaner clear clerk clever click cliff climb clip cloak	clock close closet cloth clothes clothing cloud cloudy clover clown club cluck clump coach coal coast coat cob cobbler cocoa coconut cocoon cod codfish coffee coffeepot coin cold collar college color colored colt column comb come comfort comic coming company compare conductor cone connect coo cook	cooked cooking cookie cookies cool cooler coop copper copy cord cork corn corner correct cost cot cottage cotton couch cough could couldn't count counter country county course court cousin cover cow coward cowardly cowboy cozy crab crack cracker cradle cramps cranberry crank cranky crash crawl crazy	cream creamy creek creep crept cried croak crook crooked crop cross crossing cross-eyed crow crowd crowded crown cruel crumb crumble crush crust cry cries cub cuff cup cuff cup cupboard cupful cure curl curly curtain curve cushion custard customer cut cute cutting dab dad daddy daily dairy daisy dam damage dame damp dance dancer dancing dandy danger dangerous dare dark darkness darling darn dart dash date daughter	dawn day daybreak daytime dead deaf deal dear death December decide deck deed deep deer defeat defend defense delight den dentist depend deposit describe desert	deserve desire desk destroy devil dew diamond did didn't die died dies difference different dig dim dime dine ding-dong dinner dip direct direction dirt dirty	discover dish dislike dismiss ditch dive diver divide do dock doctor does doesn't dog doll dollar dolly done donkey don't door doorbell doorknob doorstep dope	dot double dough dove down downstairs downtown dozen drag drain drank draw drawer draw drawing dream dress dresser dressmaker drew dried drift drill drink drip	drive driven driver drop drove drown drowsy drub drum drunk dry duck due dug dull dumb dump during dust dusty duty dwarf dwell dwelt dying each eager eagle ear early earn earth east eastern easy eat eaten	edge egg eh eight eighteen eighth eighty either elbow elder eldest electric	electricity elephant eleven elf elm else elsewhere empty end ending enemy engine	engineer English enjoy enough enter envelope equal erase eraser errand escape eve	even evening ever every everybody everyday everyone everything everywhere evil exact except	exchange excited exciting excuse exit expect explain extra eye eyebrow fable face facing fact factory fail faint fair fairy faith fake fall false family fan fancy far faraway fare farmer farm farming far-off farther fashion fast fasten fat father	fault favor favorite fear feast feather February fed feed feel feet fell fellow felt fence fever few fib fiddle field fife fifteen fifth fifty fig fight figure file fill	film finally find fine finger finish fire firearm firecracker fireplace fireworks firing first fish fisherman fist fit fits five fix flag flake flame flap flash flashlight flat flea flesh	flew flies flight flip flip-flop float flock flood floor flop flour flow flower flowery flutter fly foam fog foggy fold folks follow following fond food fool foolish foot football	footprint for forehead forest forget forgive forgot forgotten fork form fort forth fortune forty forward fought found fountain four fourteen fourth fox frame free freedom freeze freight French fresh	fret Friday fried friend friendly friendship frighten frog from front frost frown froze fruit fry fudge fuel full fully fun funny fur furniture further fuzzy gain gallon gallop game gang garage garbage garden gas gasoline gate gather gave gay gear geese general gentle gentleman gentlemen	geography get getting giant gift gingerbread girl give given giving glad gladly glance glass glasses gleam glide glory glove glow	glue go going goes goal goat gobble God god godmother gold golden goldfish golf gone good goods goodbye good-by goodbye	good-bye good-looking goodness goody goose gooseberry got govern government gown grab gracious grade grain grand grandchild grandchildren granddaughter grandfather grandma	grandmother grandpa grandson grandstand grape grapes grapefruit grass grasshopper grateful grave gravel graveyard gravy gray graze grease great green greet	grew grind groan grocery ground group grove grow guard guess guest guide gulf gum gun gunpowder guy ha habit had hadn't hail hair haircut hairpin half hall halt ham hammer hand handful handkerchief handle handwriting hang happen happily happiness happy harbor hard hardly hardship hardware hare hark	harm harness harp harvest has hasn't haste hasten hasty hat hatch hatchet hate haul have haven't having hawk hay hayfield haystack he head headache heal health healthy heap hear hearing	heard heart heat heater heaven heavy he'd heel height held hell he'll hello helmet help helper helpful hem hen henhouse her hers herd here here's hero herself he's hey hickory	hid hidden hide high highway hill hillside hilltop hilly him himself hind hint hip hire his hiss history hit hitch hive ho hoe hog hold holder hole holiday hollow holy	home homely homesick honest honey honeybee honeymoon honk honor hood hoof hook hoop hop hope hopeful hopeless horn horse horseback horseshoe hose hospital host hot hotel hound hour house housetop	housewife housework how however howl hug huge hum humble hump hundred hung hunger hungry hunk hunt hunter hurrah hurried hurry hurt husband hush hut hymn I ice icy I'd idea ideal if ill	I'll I'm important impossible improve in inch inches	income indeed Indian indoors ink inn insect inside	instant instead insult intend interested interesting into invite	iron is island isn't it its it's itself	I've ivory ivy jacket jacks jail jam January jar	jaw jay jelly jellyfish jerk jig	job jockey join joke joking jolly	journey joy joyful joyous judge jug	juice juicy July jump June junior	junk just keen keep kept kettle key	kick kid kill killed kind	kindly kindness king kingdom kiss	kitchen kite kitten kitty knee	kneel knew knife knit knives	knob knock knot know known lace lad ladder ladies lady laid lake lamb lame lamp land lane language lantern lap lard large lash lass last	late laugh laundry law lawn lawyer lay lazy lead leader leaf leak lean leap learn learned least leather leave leaving	led left leg lemon lemonade lend length less lesson let let's letter letting lettuce level liberty library lice lick lid	lie life lift light lightness lightning like likely liking lily limb lime limp line linen lion lip list listen lit	little live lives lively liver living lizard load loaf loan loaves lock locomotive log lone lonely lonesome long look lookout	loop loose lord lose loser loss lost lot loud love lovely lover low luck lucky lumber lump lunch lying machine machinery mad made magazine magic maid mail mailbox mailman major make making male mama mamma man manager mane manger many map	maple marble march March mare mark market marriage married marry mask mast master mat match matter mattress may May maybe mayor maypole me	meadow meal mean means meant measure meat medicine meet meeting melt member men mend meow merry mess message met metal mew mice middle	midnight might mighty mile milk milkman mill miler million mind mine miner mint minute mirror mischief miss Miss misspell mistake misty mitt mitten	mix moment Monday money monkey month moo moon moonlight moose mop more morning morrow moss most mostly mother motor mount mountain mouse mouth	move movie movies moving mow Mr. Mrs. much mud muddy mug mule multiply murder music must my myself nail name nap napkin narrow nasty naughty navy near nearby	nearly neat neck necktie need needle needn't Negro neighbor neighborhood	neither nerve nest net never nevermore new news newspaper next	nibble nice nickel night nightgown nine nineteen ninety no nobody	nod noise noisy none noon nor north northern nose not	note nothing notice November now nowhere number nurse nut oak oar oatmeal oats obey ocean o'clock October odd of off	offer office officer often oh oil old old-fashioned on once one	onion only onward open or orange orchard order ore organ other	otherwise ouch ought our ours ourselves out outdoors outfit outlaw outline	outside outward oven over overalls overcoat overeat overhead overhear overnight overturn	owe owing owl own owner ox pa pace pack package pad page paid pail pain painful paint painter painting pair pal palace pale pan pancake pane pansy pants papa paper parade pardon parent park part partly partner party	pass passenger past paste pasture pat patch path patter pave pavement paw pay payment pea peas peace peaceful peach peaches peak peanut pear pearl peck peek peel peep peg pen pencil penny	people pepper peppermint perfume perhaps person pet phone piano pick pickle picnic picture pie piece pig pigeon piggy pile pill pillow pin pine pineapple pink pint pipe pistol pit pitch pitcher pity	place plain plan plane plant plate platform platter play player playground playhouse playmate plaything pleasant please pleasure plenty plow plug plum pocket pocketbook poem point poison poke pole police policeman polish polite	pond ponies pony pool poor pop popcorn popped porch pork possible post postage postman pot potato potatoes pound pour powder power powerful praise pray prayer prepare present pretty price prick prince princess	print prison prize promise proper protect proud prove prune public puddle puff pull pump pumpkin punch punish pup pupil puppy pure purple purse push puss pussy pussycat put putting puzzle quack quart	quarter queen	queer question	quick quickly	quiet quilt	quit quite rabbit race rack radio radish rag rail railroad railway rain rainy rainbow raise raisin rake ram ran ranch rang rap rapidly	rat rate rather rattle raw ray reach read reader reading ready real really reap rear reason rebuild receive recess record red	redbird redbreast refuse reindeer rejoice remain remember remind remove rent repair repay repeat report rest return review reward rib ribbon rice	rich rid riddle ride rider riding right rim ring rip ripe rise rising river road roadside roar roast rob robber robe	robin rock rocky rocket rode roll roller roof room rooster root rope rose rosebud rot rotten rough round route row rowboat	royal rub rubbed rubber rubbish rug rule ruler rumble run rung runner running rush rust rusty rye sack sad saddle sadness safe safety said sail sailboat sailor saint salad sale salt same sand sandy sandwich sang sank sap sash sat satin satisfactory Saturday sausage savage save savings saw say scab scales scare scarf school schoolboy schoolhouse schoolmaster schoolroom scorch score scrap scrape scratch scream screen screw scrub sea seal seam search season seat second secret see seeing seed seek seem seen seesaw select self selfish	sell send sense sent sentence separate September servant serve service set setting settle settlement seven seventeen seventh seventy several sew shade shadow shady shake shaker shaking shall shame shan't shape share sharp shave she she'd she'll she's shear shears shed sheep sheet shelf shell shepherd shine shining shiny ship shirt shock shoe shoemaker shone shook shoot shop shopping shore short shot should shoulder shouldn't shout shovel show shower	shut shy sick sickness side sidewalk sideways sigh sight sign silence silent silk sill silly silver simple sin since sing singer single sink sip sir sis sissy sister sit sitting six sixteen sixth sixty size skate skater ski skin skip skirt sky slam slap slate slave sled sleep sleepy sleeve sleigh slept slice slid slide sling slip slipped slipper slippery slit slow slowly sly smack small smart smell	smile smoke smooth snail snake snap snapping sneeze snow snowy snowball snowflake snuff snug so soak soap sob socks sod soda sofa soft soil sold soldier sole some somebody somehow someone something sometime sometimes somewhere son song soon sore sorrow sorry sort soul sound soup sour south southern space spade spank sparrow speak speaker spear speech speed spell spelling spend spent spider spike spill spin spinach spirit spit	splash spoil spoke spook spoon sport spot spread spring springtime sprinkle square squash squeak squeeze squirrel stable stack stage stair stall stamp stand star stare start starve state station stay steak steal steam steamboat steamer steel steep steeple steer stem step stepping stick sticky stiff still stillness sting stir stitch stock stocking stole stone stood stool stoop stop stopped stopping store stork stories storm stormy story stove straight	strange stranger strap straw strawberry stream street stretch string strip stripes strong stuck study stuff stump stung subject such suck sudden suffer sugar suit sum summer sun Sunday sunflower sung sunk sunlight sunny sunrise sunset sunshine supper suppose sure surely surface surprise swallow swam swamp swan swat swear sweat sweater sweep sweet sweetness sweetheart swell swept swift swim swimming swing switch sword swore table tablecloth tablespoon tablet tack tag tail tailor take taken taking tale talk talker tall tame tan tank tap tape tar tardy task taste taught tax tea teach teacher team tear	tease teaspoon teeth telephone tell temper ten tennis tent term terrible test than thank thanks thankful Thanksgiving that that's the theater thee their them then there these they they'd they'll they're	they've thick thief thimble thin thing think third thirsty thirteen thirty this thorn those though thought thousand thread three threw throat throne through throw thrown thumb thunder Thursday thy tick ticket	tickle tie tiger tight till time tin tinkle tiny tip tiptoe tire tired title to toad toadstool toast tobacco today toe together toilet told tomato tomorrow ton tone tongue tonight too	took tool toot tooth toothbrush toothpick top tore torn toss touch tow toward towards towel tower town toy trace track trade train tramp trap tray treasure treat tree trick tricycle tried	trim trip trolley trouble truck true truly trunk trust truth try tub Tuesday tug tulip tumble tune tunnel turkey turn turtle twelve twenty twice twig twin two ugly umbrella uncle under understand underwear	undress unfair unfinished unfold unfriendly unhappy	unhurt uniform United States unkind unknown	unless unpleasant until unwilling up upon	upper upset upside upstairs uptown upward	us use used useful valentine valley valuable	value vase vegetable	velvet very vessel	victory view village	vine violet visit	visitor voice vote wag wagon waist wait wake waken walk wall walnut want war warm warn was wash washer washtub wasn't waste watch watchman water watermelon waterproof wave wax	way wayside we weak weakness weaken wealth weapon wear weary weather weave web we'd wedding Wednesday wee weed week we'll weep weigh welcome well went were	we're west western wet we've whale what what's wheat wheel when whenever where which while whip whipped whirl whisky whiskey whisper whistle white who who'd whole	who'll whom who's whose why wicked wide wife wiggle wild wildcat will willing willow win wind windy windmill window wine wing wink winner winter wipe wire	wise wish wit witch with without woke wolf woman women won wonder wonderful won't wood wooden woodpecker woods wool woolen word wore work worker workman world	worm worn worry worse worst worth would wouldn't wound wove wrap wrapped wreck wren wring write writing written wrong wrote wrung yard yarn year yell	yellow yes yesterday yet	yolk yonder you you'd	you'll young youngster your	yours you're yourself yourselves	youth you've"""
easy_word_set = set(easy_words.split())
def complex_words_num(text):
	text_list = text.split()
	diff_words_set = set()
	for value in text_list:
		if value not in easy_word_set:
			if syllable_num(value) > 1:
				if value not in diff_words_set:
					diff_words_set.add(value)
	return len(diff_words_set)

## Some functions

In [None]:
def word_sent(text): #avg_sentence_length
    lc = word_num(text)
    sc = sent_num(text)
    return float(lc)/float(sc)


def syll_word(text): #avg_syllables_per_word
    syllable = syllable_num(text)
    words = word_num(text)
    ASPW = float(syllable)/float(words)
    return ASPW



def char_word(text): #avg_letter_per_word
    ALPW = float(float(char_num(text))/float(word_num(text)))
    return ALPW



def word_sent(text):
    AWPS = float(float(word_num(text))/float(sent_num(text)))
    return AWPS

def sent_word(text): #avg_sentence_per_word
    ASPW = float(float(sent_num(text))/float(word_num(text)))
    return ASPW

## Automated Reading Index

In [None]:
def Automated_Readability_Index(text):
    try:
        a = char_word(text)
        b = word_sent(text)
        ARI = (4.71 * round(a, 2)) + (0.5*round(b, 2)) - 21.43
        return round(ARI, 2)
    except:
        return 'NaN'

In [None]:
ARI = Body_clean_puc_df['Body'].apply(Automated_Readability_Index)

In [None]:
ARI.value_counts().head()

## Coleman Liau Index

In [None]:
def Coleman_Liau_Index(text):
    try:
        L = char_word(text)*100
        S = sent_word(text)*100
        CLI = float((0.058 * L) - (0.296 * S) - 15.8)
        return round(CLI, 2)
    except:
        return 'NaN'

In [None]:
Coleman_Liau_Index(test_data)

In [None]:
CLI = Body_clean_puc_df['Body'].apply(Coleman_Liau_Index)

In [None]:
CLI.value_counts().head()

## Flesch Kincaid Grade Level

In [None]:
def Flesch_Kincaid_Grade(text):
    try:
        ASL = word_sent(text)
        ASW = syll_word(text)
        FKRA = float(0.39 * ASL) + float(11.8 * ASW) - 15.59
        return round(FKRA, 2)
    except:
        return 'NaN'

In [None]:
FKG = Body_clean_puc_df['Body'].apply(Flesch_Kincaid_Grade)

In [None]:
FKG.value_counts().head()

## Flesch reading ease

In [None]:
def Flesch_Reading_Ease(text):
    try:
        ASL = word_sent(text)
        ASW = syll_word(text)
        FRE = 206.835 - float(1.015 * ASL) - float(84.6 * ASW)
        return round(FRE, 2)
    except:
        return 'NaN'

In [None]:
FRE = Body_clean_puc_df['Body'].apply(Flesch_Reading_Ease)


In [None]:
FRE.value_counts().head()

## Gunning fog index

In [None]:
def Gunning_Fog(text):
    try:
        per_diff_words = (complex_words_num(test_data)/word_num(text)*100)
        grade = 0.4*(word_sent(text) + per_diff_words)
        return round(grade,2)
    except:
        return 'NaN'

In [None]:
GFI = Body_clean_puc_df['Body'].apply(Gunning_Fog)

In [None]:
GFI.value_counts().head()

## Metric Entropy

In [None]:
def Metric_Entropy(original): #original body with tags
    try:
        sentropy = entropy.shannon_entropy(original)
        body_length = len(original)
        ME = float(sentropy/body_length)
        return ME
    except:
        return 'NaN'

In [None]:
ME = anpost_df['Body'].apply(Metric_Entropy)

In [None]:
ME.value_counts().head()

## LOC Percentage

In [None]:
def LOC_ptg(data):
    try:
        LOC = data['Code number']
        LOB = sent_num(data['Body'])
        LP = float(LOC/(LOC+LOB))
        return LP
    except:
        return 'NaN'

In [None]:
LOC_Per = anpost_df.apply(LOC_ptg,axis=1)

In [None]:
LOC_Per.value_counts().head()

## Generate pickle

In [None]:
Body_clean_puc_df['Id'] = qpost_df['Id']
Body_clean_puc_df['word_count'] = Words_num_df
Body_clean_puc_df['sentence_count'] = Sents_num_df
Body_clean_puc_df['ARI'] = ARI
Body_clean_puc_df['CLI'] = CLI
Body_clean_puc_df['FKG'] = FKG
Body_clean_puc_df['FRE'] = FRE
Body_clean_puc_df['GFI'] = GFI
Body_clean_puc_df['M_Entropy'] = ME
Body_clean_puc_df['LOC_Per'] = LOC_Per
Body_clean_puc_df['Score'] = qpost_df['Score']

In [None]:
Body_clean_puc_df.head()

In [None]:
import pickle
with open('Readability_Metrics.pickle', 'wb') as handle:
    pickle.dump(Body_clean_puc_df, handle,protocol=2)

In [None]:
target = ['Id','Code number','formula','word_count','sentence_count','ARI','CLI','FKG','FRE','GFI','M_Entropy','LOC_Per','Score']
RM = Body_clean_puc_df[target]
RM.head()

In [None]:
import pickle
with open('Readability_Metrics_wtbody.pickle', 'wb') as handle:
    pickle.dump(RM, handle,protocol=2)

## Popularity Matrix

In [None]:
import datetime
cutoff1 = datetime.datetime(2015, 6, 1)

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline 
sns.set_style("whitegrid")

In [None]:
anpost_df.CreationDate = pd.to_datetime(anpost_df.CreationDate)

## Accepted by Originator Votes

In [None]:
def getAnsCount(row):
    records = apost_df[(apost_df['OwnerUserId']== row['OwnerUserId']) & (apost_df['CreationDate']< row['CreationDate'])]
    return records.shape[0]

In [None]:
#Accepted by Originator Votes
tt = anpost_df.apply(getAnsCount,axis=1)

In [None]:
sns.distplot(tt, rug=False)

## Approved Edited Suggestions Count

In [None]:
# return all the records for questions posts from posts table
posthist_query = "SELECT PostHistoryTypeId,CreationDate,UserId FROM posthistory"
posthist_df = pd.read_sql_query(posthist_query, conn)
posthist_df.CreationDate = pd.to_datetime(posthist_df.CreationDate)

In [None]:
postsedited = posthist_df[posthist_df['PostHistoryTypeId'].isin([4,5,6])]


In [None]:
def getAppEditsCount(row):    
    records = postsedited[(postsedited['UserId']== row['OwnerUserId']) & (postsedited['CreationDate']< row['CreationDate'])]
    return records.shape[0]

In [None]:
#Approved Edited Suggestions Count
AESC = anpost_df.apply(getAppEditsCount,axis=1)

In [None]:
sns.distplot(AESC, rug=False)

In [None]:
AESC.value_counts().head()

## Total Badge Count

In [None]:
# return all the records from badges table
badges_query = "SELECT Name,Date,UserId FROM badges"
badges_df = pd.read_sql_query(badges_query, conn)
badges_df.Date = pd.to_datetime(badges_df.Date)

In [None]:
badges_df.shape

In [None]:
def getBadgeCount(row):    
    records = badges_df[(badges_df['UserId']== row['OwnerUserId']) & (badges_df['Date']< row['CreationDate'])]
    return records.shape[0]

In [None]:
#Total Badge Count
TBC = anpost_df.apply(getBadgeCount,axis=1)

In [None]:
TBC.value_counts().head()

## Total Close and Deletion Votes

In [None]:
# return all the records from votes table
votes_query = "SELECT VoteTypeId,CreationDate,PostId,UserId FROM votes"
votes_df = pd.read_sql_query(votes_query, conn)
votes_df.CreationDate = pd.to_datetime(votes_df.CreationDate)

In [None]:
votes_df = votes_df[votes_df['VoteTypeId'].isin([6,10])]

In [None]:
def getTCDVCount(row):    
    postid_target = post_df[(post_df['OwnerUserId']== row['OwnerUserId'])&(post_df['CreationDate']< row['CreationDate'])].Id.tolist()
    records = votes_df[(votes_df['PostId'].isin(postid_target)) & (votes_df['CreationDate']< row['CreationDate'])]
    return records.shape[0]

In [None]:
#Total Close and Deletion Votes¶
TCDV = anpost_df.apply(getTCDVCount,axis=1)

In [None]:
TCDV.value_counts()

## Total Up Votes received

In [None]:
# return all the records from votes table
votes_query = "SELECT VoteTypeId,CreationDate,PostId,UserId FROM votes"
votes_df = pd.read_sql_query(votes_query, conn)
votes_df.CreationDate = pd.to_datetime(votes_df.CreationDate)

In [None]:
votes_df = votes_df[votes_df['VoteTypeId']==2]

In [None]:
def getUpVoteCount(row):    
    #get the past post ids from the user
    postid_target = post_df[(post_df['OwnerUserId']== row['OwnerUserId'])&(post_df['CreationDate']< row['CreationDate'])].Id.tolist()
    records = votes_df[(votes_df['PostId'].isin(postid_target)) & (votes_df['CreationDate']< row['CreationDate'])]
    return records.shape[0]

In [None]:
#Total Up Votes by the time of new post
TUVC = anpost_df.apply(getTCDVCount,axis=1)

In [None]:
TUVC.value_counts().head()

## Total Down Votes received¶

In [None]:
# return all the records from votes table
votes_query = "SELECT VoteTypeId,CreationDate,PostId,UserId FROM votes"
votes_df = pd.read_sql_query(votes_query, conn)
votes_df.CreationDate = pd.to_datetime(votes_df.CreationDate)
votes_df = votes_df[votes_df['VoteTypeId']==3]

In [None]:
def getDownVoteCount(row):    
    #get the past post ids from the user
    postid_target = post_df[(post_df['OwnerUserId']== row['OwnerUserId'])&(post_df['CreationDate']< row['CreationDate'])].Id.tolist()
    records = votes_df[(votes_df['PostId'].isin(postid_target)) & (votes_df['CreationDate']< row['CreationDate'])]
    return records.shape[0]

In [None]:
#Total Up Votes by the time of new post
TDVC = anpost_df.apply(getDownVoteCount,axis=1)

In [None]:
TDVC.value_counts().head()

## Total Favorite Votes received

In [None]:
votes_query = "SELECT VoteTypeId,CreationDate,PostId,UserId FROM votes WHERE VoteTypeId==5"
votes_df = pd.read_sql_query(votes_query, conn)
votes_df.CreationDate = pd.to_datetime(votes_df.CreationDate)

In [None]:
def getFavVoteCount(row):    
    #get the past post ids from the user
    postid_target = post_df[(post_df['OwnerUserId']== row['OwnerUserId'])&(post_df['CreationDate']< row['CreationDate'])].Id.tolist()
    records = votes_df[(votes_df['PostId'].isin(postid_target)) & (votes_df['CreationDate']< row['CreationDate'])]
    return records.shape[0]

In [None]:
#Total Up Votes by the time of new post
TFVC = anpost_df.apply(getFavVoteCount,axis=1)

In [None]:
TFVC.value_counts().head()

## Integrate all the Popularity Metrics features

In [None]:
anpost_df['AccpAnsCnt'] = tt
anpost_df['AppEditCnt'] = AESC
anpost_df['BadgeCnt'] = TBC
anpost_df['CloDelVCnt'] = TCDV
anpost_df['UpVoteCnt'] = TUVC
anpost_df['DownVoteCnt'] = TDVC
anpost_df['FavVoteCnt'] = TFVC

In [None]:
target_cols = ['Id','CreationDate','OwnerUserId','AccpAnsCnt','AppEditCnt','BadgeCnt','CloDelVCnt','UpVoteCnt',\
               'DownVoteCnt','FavVoteCnt','Score']

In [None]:
popularity_metric_df = qpost_df[target_cols]

In [None]:
popularity_metric_df.head(10)