## wet Files

In [1]:
import warc
import requests
from contextlib import closing
import io
from glob import glob
from langdetect import detect

In [2]:
warc_files = glob('*.wet.gz')

In [3]:
f = warc.open(warc_files[0])

In [4]:
temp_url = []
temp_content = []
for record in f:
    url = record.header.get('warc-target-uri',None)
    if not url:
        continue   
    text = record.payload.read().decode()
    
    temp_url.append(url)
    temp_content.append(text)

In [7]:
import pandas as pd

In [8]:
df = pd.DataFrame({"url":temp_url, "content":temp_content})

# Detect if content is english

In [25]:
import fasttext
path_to_pretrained_model = 'lid.176.bin'
fmodel = fasttext.load_model(path_to_pretrained_model)
def is_eng(x):
    try:
        return fmodel.predict(x)[0][0][-2:]
    except:
        return " "



In [15]:
df['content'] = df['content'].apply(lambda x:x.replace('\n',''))

In [26]:
df['lang'] = df['content'].apply(is_eng)

In [30]:
df = df[df['lang']=='en']

In [61]:
df = df.reset_index(drop=True)

In [53]:
import spacy
nlp = spacy.load('en_core_web_sm')

In [69]:
temp_token = [token for token in nlp(df['content'][0])]

In [None]:
df['content_tokenized'] = df['content'].apply(lambda x:[token for token in nlp(x)])

In [32]:
from sklearn.feature_extraction.text import CountVectorizer

In [33]:
cv = CountVectorizer(max_df=0.95, min_df=2, stop_words='english')

In [34]:
dtm = cv.fit_transform(df['content'])

In [35]:
from sklearn.decomposition import LatentDirichletAllocation

In [36]:
LDA = LatentDirichletAllocation(n_components=7, random_state=42)

In [37]:
LDA.fit(dtm)

LatentDirichletAllocation(batch_size=128, doc_topic_prior=None,
             evaluate_every=-1, learning_decay=0.7,
             learning_method='batch', learning_offset=10.0,
             max_doc_update_iter=100, max_iter=10, mean_change_tol=0.001,
             n_components=7, n_jobs=None, n_topics=None, perp_tol=0.1,
             random_state=42, topic_word_prior=None,
             total_samples=1000000.0, verbose=0)

In [47]:
LDA.components_.shape

(7, 415353)

In [48]:
for index,topic in enumerate(LDA.components_):
    print(f'THE TOP 15 WORDS FOR TOPIC #{index}')
    print([cv.get_feature_names()[i] for i in topic.argsort()[-15:]])
    print('\n')

THE TOP 15 WORDS FOR TOPIC #0
['22', '2016', 'people', 'pm', 'says', 'january', 'said', 'just', 'time', 'new', '12', '11', '2019', '10', 'like']


THE TOP 15 WORDS FOR TOPIC #1
['jo', 'var', 'height', 'width', 'return', '17', 'ri', '16', '14', '13', '15', '11', '12', '10', 'function']


THE TOP 15 WORDS FOR TOPIC #2
['view', 'white', 'products', '000', 'sale', 'best', 'airport', 'new', '00', 'product', 'design', '10', 'free', 'home', 'price']


THE TOP 15 WORDS FOR TOPIC #3
['service', 'page', 'url', 'information', 'online', 'http', 'doc', 'available', 'short', 'description', 'manual', 'file', 'pdf', 'www', 'com']


THE TOP 15 WORDS FOR TOPIC #4
['state', 'news', 'county', 'united', 'south', '30', '10', 'world', 'city', 'week', '2019', 'music', '2020', 'new', '00']


THE TOP 15 WORDS FOR TOPIC #5
['document', 'type', 'false', 'height', 'css', 'width', 'ui', 'left', 'options', 'length', 'null', 'data', 'return', 'var', 'function']


THE TOP 15 WORDS FOR TOPIC #6
['text', 'main', 'search