In [107]:
import nltk
%matplotlib inline
import pandas as pd
import numpy as np
import seaborn as sns
from IPython.core.display import HTML
import matplotlib.pyplot as plt
# English stopword list used below to filter tokens; needs the NLTK 'stopwords'
# corpus to be available locally (e.g. via nltk.download('stopwords')).
stopwords = nltk.corpus.stopwords.words('english')

In [141]:
def freq(word, tokens):
    """Return how many times `word` occurs in `tokens`.

    Delegates to the container's own ``count`` method, so `tokens` may be
    any object that provides one (a list of tokens in this notebook).
    """
    occurrences = tokens.count(word)
    return occurrences

def word_count(tokens):
    """Return the number of items held in `tokens` (its length)."""
    total = len(tokens)
    return total

def tf(word, tokens):
    """Return the term frequency of `word` within `tokens`.

    The value is ``freq(word, tokens) / word_count(tokens)`` as a float.

    An empty `tokens` sequence yields 0.0 instead of raising
    ZeroDivisionError, since a word cannot occur in an empty document.
    """
    total = word_count(tokens)
    if total == 0:
        # Nothing to divide by: define the frequency as zero.
        return 0.0
    return freq(word, tokens) / float(total)

def getFreqByDoc(doc):
    """Build the token/bigram frequency table for one document row.

    `doc` is read positionally: doc[0] goes to the 'doc' column, doc[1] and
    doc[2] are the text fields that get tokenized together, doc[3] goes to
    'source' and doc[5] to 'sample_type'.
    (Presumably id, title, abstract, event_name, ..., sample_type -- confirm
    against the column order of the frame passed by the caller.)

    Returns a DataFrame with columns
    ['doc', 'word', 'frequency', 'word_count', 'tf', 'source', 'sample_type'].
    Every token *occurrence* produces its own row, so a token that occurs
    k times appears in k identical rows.

    'frequency' is counted over unigrams and bigrams together, while
    'word_count' (the tf denominator) counts the filtered unigrams only.
    When no unigram survives filtering, 'tf' is reported as 0.0 rather than
    raising ZeroDivisionError.
    """
    from collections import Counter

    # Set membership is O(1); the module-level stopwords object is a list.
    stop_set = set(stopwords)

    #get tokens
    raw_tokens = nltk.word_tokenize(str(doc[1]) + ' ' + str(doc[2]))

    #get bitokens (built from the unfiltered token stream)
    bi_tokens = [' '.join(pair).lower() for pair in nltk.bigrams(raw_tokens)]
    # NOTE(review): the stopword list appears to hold single words, so a
    # space-joined bigram will presumably never match here. Confirm whether
    # bigrams *containing* a stopword were meant to be dropped instead.
    bi_tokens = [token for token in bi_tokens if token not in stop_set]

    # Unigrams: keep tokens longer than two characters, lowercase them, then
    # drop stopwords.
    tokens = [token.lower() for token in raw_tokens if len(token) > 2]
    tokens = [token for token in tokens if token not in stop_set]

    #merge tokens and bi
    alltokens = tokens + bi_tokens

    wc = word_count(tokens)
    # Count everything once instead of rescanning the list for each token.
    counts = Counter(alltokens)

    olist = []
    for token in alltokens:
        f = counts[token]
        # wc can be 0 while bigrams still exist (all unigrams filtered out).
        term_freq = f / float(wc) if wc else 0.0
        olist.append([doc[0], token, f, wc, term_freq, doc[3], doc[5]])

    df = pd.DataFrame(olist, columns=['doc','word', 'frequency', 'word_count', 'tf','source', 'sample_type'])
    return df

def createFreqCSV(sdf, filename, mode='a'):
    """Write the frequency rows of every document in `sdf` to a CSV file.

    Parameters:
        sdf: DataFrame of documents; each row is handed to getFreqByDoc as a
            plain list of its values.
        filename: path of the CSV file to write.
        mode: file mode passed to open(). Defaults to 'a' (append), so calling
            this again with the same file adds the rows a second time; pass
            'w' to overwrite instead.

    The rows are written without a header line and without the index. The
    file is opened once for the whole run (and is therefore created even
    when `sdf` has no rows).
    """
    # .values replaces DataFrame.as_matrix(), which newer pandas no longer has.
    doc_rows = sdf.values.tolist()
    with open(filename, mode) as out_file:
        for doc_row in doc_rows:
            getFreqByDoc(doc_row).to_csv(out_file, header=False, index=False)

def createFreqDf(sdf):
    """Return one DataFrame holding the frequency rows of every document in `sdf`.

    Each row of `sdf` is converted to a plain list and passed to
    getFreqByDoc; the per-document frames are concatenated in row order.
    The per-document index values are kept as they are (not renumbered).

    Raises:
        ValueError: from pd.concat when `sdf` has no rows (nothing to
            concatenate).
    """
    # .values replaces DataFrame.as_matrix(), which newer pandas no longer has.
    doc_rows = sdf.values.tolist()
    per_doc_frames = [getFreqByDoc(doc_row) for doc_row in doc_rows]
    return pd.concat(per_doc_frames)

In [119]:
#reading the abstracts in to a DF
# pd.read_csv replaces DataFrame.from_csv, which is deprecated and no longer
# exists in newer pandas. index_col=None keeps every CSV column as a regular
# column, as before.
mfgDf = pd.read_csv('../../data/positive_data.csv', index_col=None)
nsfDf = pd.read_csv('../../data/negative_data.csv', index_col=None)

In [135]:
#draw samples
# A fixed seed keeps the drawn samples (and everything derived from them)
# reproducible across kernel restarts.
SAMPLE_SIZE = 1000
RANDOM_SEED = 42

# .assign returns a new frame with the label column appended last, so the
# label is never written into a possible copy of the source frame (avoids
# SettingWithCopyWarning) and re-running the cell is harmless.
mfgDfSample = mfgDf.sample(SAMPLE_SIZE, random_state=RANDOM_SEED).assign(sample_type="MFG")

nsfDfSample = nsfDf.sample(SAMPLE_SIZE, random_state=RANDOM_SEED).assign(sample_type="NON-MFG")

In [121]:
# Preview the first rows of the positive sample to check the columns,
# including the sample_type label added when the sample was drawn.
mfgDfSample.head()

Unnamed: 0,id,title,abstract,event_name,event_year,sample_type
2185,NSF_20140101_1462280,CHS: Medium: Collaborative Research: Immediate...,American Sign Language (ASL) is a primary mean...,NSF,2014.0,MFG
4080,CIRP_20120101_383-386,Raw part characterisation and automated alignm...,Large raw parts require a long time consuming ...,CIRP,2012.0,MFG
2627,NSF_20150101_1536918,Household-Level Use of Autonomous Vehicles: Mo...,"With recent development in vehicle automation,...",NSF,2015.0,MFG
4358,CIRP_20140101_058-1205,Alternation of analysis and synthesis for conc...,Concept generation involves both analysis and ...,CIRP,2014.0,MFG
466,NSF_20130101_1316583,"Workshop on Humanitarian Logistics Research, A...",This grant provides funding to host a workshop...,NSF,2013.0,MFG


In [127]:
#optional: build the frequency tables in memory instead of writing CSV files
#mfgf = createFreqDf(mfgDfSample)
#print(mfgf)
#nsff = createFreqDf(nsfDfSample)
#print(nsff)


In [142]:
#positive sample raw csv
createFreqCSV(mfgDfSample, 'C:\\datascience\\data\\raw\\positive.csv')
#negative sample raw csv
createFreqCSV(nsfDfSample, 'C:\\datascience\\data\\raw\\negative.csv')

In [None]:
# Reference-only MySQL script. It is kept as a bare string, so running this
# cell only echoes the text and executes nothing against a database.
# NOTE(review): getFreqByDoc writes seven fields per CSV row (doc, word,
# frequency, word_count, tf, source, sample_type), but the tables and the
# LOAD DATA column lists below name only the first five -- confirm that the
# two trailing fields are meant to be dropped on load.
# NOTE(review): LINES TERMINATED BY '\n' assumes the CSV files use bare
# newlines -- verify against the files actually produced on Windows.
''' DB Script

CREATE TABLE `raw_keywords` (
  `id` int(11) NOT NULL AUTO_INCREMENT,
  `doc` varchar(45) DEFAULT NULL,
  `word` varchar(145) DEFAULT NULL,
  `frequency` int(11) DEFAULT NULL,
  `word_count` int(11) DEFAULT NULL,
  `tf` double DEFAULT NULL,
  PRIMARY KEY (`id`)
) ENGINE=InnoDB AUTO_INCREMENT=589816 DEFAULT CHARSET=latin1;

truncate raw_keywords;

LOAD DATA LOCAL INFILE 'C:\\datascience\\data\\raw\\positive.csv' INTO TABLE raw_keywords
CHARACTER SET UTF8 
FIELDS TERMINATED BY ',' 
ENCLOSED BY '"' 
LINES TERMINATED BY '\n'
IGNORE 0 LINES
(doc, word, frequency, word_count, tf);

CREATE TABLE `raw_keywords_neg` (
  `id` int(11) NOT NULL AUTO_INCREMENT,
  `doc` varchar(45) DEFAULT NULL,
  `word` varchar(145) DEFAULT NULL,
  `frequency` int(11) DEFAULT NULL,
  `word_count` int(11) DEFAULT NULL,
  `tf` double DEFAULT NULL,
  PRIMARY KEY (`id`)
);

truncate raw_keywords_neg;

LOAD DATA LOCAL INFILE 'C:\\datascience\\data\\raw\\negative.csv' INTO TABLE raw_keywords_neg
CHARACTER SET UTF8 
FIELDS TERMINATED BY ',' 
ENCLOSED BY '"' 
LINES TERMINATED BY '\n'
IGNORE 0 LINES
(doc, word, frequency, word_count, tf);

'''