## **Basic processing of news contents**

In [31]:
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_extraction.text import CountVectorizer

In [32]:
# Import the newscontent of scraped news articles from 1 Oct to 5 OCt

df=pd.read_csv('newscontent.csv',header=None)

df.columns=['content']

In [33]:
df.head(5)
df

Unnamed: 0,content
0,Open source browser-based cryptocurrency walle...
1,"Bitcoin is close to bottoming, and once it reb..."
2,Very very quiet week this. Not much new ransom...
3,A recent survey by Fundstrat Global Advisors h...
4,Is this the start of a new phase for Bitcoin o...
5,"On October 5, 2018, an article published by Bl..."
6,Track the price of your favourite cryptocurren...
7,An exchange-traded bitcoin fund (ETF) would si...
8,"""It is just not credible that the United State..."
9,"BitGo, a major Palo Alto-based cryptocurrency ..."


In [34]:
#drop news with no contents

df=df.dropna()

In [35]:
#display the content text length of every piece of news

df['text length'] = df['content'].apply(len)
df

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  This is separate from the ipykernel package so we can avoid doing imports until


Unnamed: 0,content,text length
0,Open source browser-based cryptocurrency walle...,274
1,"Bitcoin is close to bottoming, and once it reb...",274
2,Very very quiet week this. Not much new ransom...,274
3,A recent survey by Fundstrat Global Advisors h...,274
4,Is this the start of a new phase for Bitcoin o...,274
5,"On October 5, 2018, an article published by Bl...",274
6,Track the price of your favourite cryptocurren...,273
7,An exchange-traded bitcoin fund (ETF) would si...,274
8,"""It is just not credible that the United State...",275
9,"BitGo, a major Palo Alto-based cryptocurrency ...",274


## **Remove punctuation and stopwords**

In [53]:
import string

import nltk
from nltk.corpus import stopwords

#Split the words in contents

def text_process(text):
 
    nopunc = [char for char in text]
    nopunc = ''.join(nopunc)
    
    return [word for word in nopunc.split()]

In [54]:
content_text2=text_process(df['content'])

In [55]:
tokens=content_text2

# convert to lower case
tokens = [w.lower() for w in tokens]

# remove punctuation from each word
import string
table = str.maketrans('', '', string.punctuation)
stripped = [w.translate(table) for w in tokens]

# remove remaining tokens that are not alphabetic
words = [word for word in stripped if word.isalpha()]

# filter out stop words
from nltk.corpus import stopwords
stop_words = set(stopwords.words('english'))
words = [w for w in words if not w in stop_words]
print(words[:100])

['open', 'source', 'browserbased', 'cryptocurrency', 'wallet', 'telescope', 'officially', 'announced', 'acquisition', 'bitmain', 'technologies', 'inc', 'largest', 'crypto', 'mining', 'rig', 'manufacturer', 'also', 'runs', 'one', 'extensive', 'cryptocurrency', 'mining', 'charsbitcoin', 'close', 'bottoming', 'rebounds', 'weeks', 'developments', 'fuel', 'climb', 'said', 'spencer', 'bogart', 'cryptocurrency', 'blockchain', 'venture', 'firm', 'blockchain', 'capital', 'percent', 'highs', 'think', 'bitcoin', 'charsvery', 'quiet', 'week', 'much', 'new', 'ransomware', 'report', 'released', 'well', 'known', 'variants', 'like', 'matrix', 'dharma', 'ransomware', 'infections', 'biggest', 'news', 'shut', 'numerous', 'restaurants', 'part', 'recipe', 'unlimited', 'charsa', 'recent', 'survey', 'fundstrat', 'global', 'advisors', 'revealed', 'bullish', 'sentiments', 'among', 'institutional', 'investors', 'regards', 'prospects', 'cryptocurrency', 'performance', 'economic', 'recession', 'investors', 'say',

## **Find the most frequent 100 words**

In [56]:
import nltk
freq_clean = nltk.FreqDist(words)
freq=freq_clean.most_common(100)

## **Write 100 frequent words into a file called corpora**

In [57]:
freq_corpora=pd.DataFrame(freq,columns=['Words','Frequency'])
freq_corpora.to_csv('corpora.csv',index_label = False,index=False)
freq_corpora

Unnamed: 0,Words,Frequency
0,bitcoin,320
1,cryptocurrency,227
2,new,143
3,blockchain,135
4,us,130
5,market,114
6,charsthe,101
7,crypto,88
8,exchange,84
9,digital,84
