<a href="https://colab.research.google.com/github/lengochai97/thesis/blob/master/notebooks/feature_construction/12_News_Features_Results.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Connect to Google Drive

In [0]:
%%capture

import google.colab.drive

# Mount Google Drive at /content/gdrive (the dataset paths used later live
# under this mount point). force_remount=True requests a fresh mount even if
# one is already present; %%capture above hides the mount output.
google.colab.drive.mount('/content/gdrive', force_remount=True)

# Install Spark and dependencies

In [0]:
import os

# Versions and install locations for the Spark setup. SPARK_VERSION and
# HADOOP_VERSION are expanded by the download/unpack shell commands in the
# next cell; SPARK_HOME is the /opt/spark symlink that cell creates.
os.environ.update({
    'HADOOP_VERSION': '2.7',
    'JAVA_HOME': '/usr/lib/jvm/java-8-openjdk-amd64',
    'SPARK_HOME': '/opt/spark',
    'SPARK_VERSION': '2.4.3',
})

In [0]:
%%capture

# Download the Spark build selected by SPARK_VERSION / HADOOP_VERSION
# (-q: quiet, -N: timestamping), unpack it under /opt, then delete the archive.
!wget -qN https://archive.apache.org/dist/spark/spark-$SPARK_VERSION/spark-$SPARK_VERSION-bin-hadoop$HADOOP_VERSION.tgz
!tar -xzf spark-$SPARK_VERSION-bin-hadoop$HADOOP_VERSION.tgz -C /opt
!rm spark-$SPARK_VERSION-bin-hadoop$HADOOP_VERSION.tgz
# Replace any existing /opt/spark with a symlink to the unpacked release so
# that SPARK_HOME (/opt/spark) points at it.
!rm -rf /opt/spark
!ln -s /opt/spark-$SPARK_VERSION-bin-hadoop$HADOOP_VERSION /opt/spark
# findspark is imported in the "Create SparkSession" section below.
!pip install -q findspark

# Create SparkSession

In [0]:
import findspark

# NOTE(review): presumably this locates the Spark install through the
# SPARK_HOME variable set earlier and makes pyspark importable for the
# following cells — confirm against the findspark documentation.
findspark.init()

In [0]:
from pyspark.sql import SparkSession

# Local session using all available cores ('local[*]'); getOrCreate() reuses
# an already-running session when the cell is executed again.
spark = (
    SparkSession.builder
    .master('local[*]')
    .getOrCreate()
)

# Read files

In [0]:
import json

import pyspark.sql.functions as F
import pyspark.sql.types as T

In [0]:
# Dataset root on the mounted Google Drive (adressa/one_week); the schema,
# news_features, and model paths used below are all joined onto it.
DATA_PATH = '/content/gdrive/My Drive/dataset/adressa/one_week'

In [0]:
# Rebuild the explicit Spark schema for the news features from its JSON
# description stored under DATA_PATH/schema.
with open(os.path.join(DATA_PATH, 'schema', 'news_features.json')) as schema_file:
  schema_json = json.load(schema_file)
news_features_schema = T.StructType.fromJson(schema_json)

In [0]:
# Read the JSON feature files with the explicit schema loaded above instead
# of letting Spark infer one.
df_news_features = (
    spark.read
    .schema(news_features_schema)
    .json(os.path.join(DATA_PATH, 'news_features'))
)

# Check results

In [0]:
# Print a sample of rows; truncate=False keeps long values such as newsId
# and the category vectors fully visible.
df_news_features.show(truncate=False)

+----------------------------------------+-----------+-----------------------+---------------------+
|newsId                                  |publishtime|categoryList           |categoryVector       |
+----------------------------------------+-----------+-----------------------+---------------------+
|ed39fa29deca0717bc6ff43a099076a6e542a050|1233225890 |[]                     |(30,[2],[1.0])       |
|daf0bcd2b4ddab61ce760835fa8a42c01775ef35|1407826736 |[pluss, nyheter]       |(30,[0,1],[1.0,1.0]) |
|7c146f6c7357bc08cb019c5f59fd3e13b2690682|1415446217 |[pluss, okonomi]       |(30,[1,7],[1.0,1.0]) |
|65d83b9b75b5322281f2970bd3707127df673ee0|1420835007 |[pluss, nyheter]       |(30,[0,1],[1.0,1.0]) |
|e07df239f1934efebedf70a4985f9ef66a9fe307|1421269371 |[pluss, nyheter]       |(30,[0,1],[1.0,1.0]) |
|814c6262b09fc2865a84e9fbd0eb9873304643f0|1421408924 |[]                     |(30,[2],[1.0])       |
|8efad3dcccf799278064c20832ee3766cab2f23d|1422177445 |[nyheter, trondheim]   |(30,[0,3],[1.

## Number of items

In [0]:
# Total number of rows in the news features DataFrame.
df_news_features.count()

1034

In [0]:
# Number of distinct newsId values; it equals the total row count only when
# every row carries a unique newsId.
df_news_features.select('newsId').distinct().count()

1034

## Disk usage

In [0]:
!du -sh /content/gdrive/My\ Drive/dataset/adressa/one_week/news_features

195K	/content/gdrive/My Drive/dataset/adressa/one_week/news_features


## `category` vocabulary

In [0]:
from pyspark.ml.feature import CountVectorizerModel

In [0]:
# Load the saved CountVectorizerModel for categories from DATA_PATH/model;
# its vocabulary is listed in the next cell.
category_count_vectorizer = CountVectorizerModel.load(os.path.join(DATA_PATH, 'model', 'category_count_vectorizer'))

In [0]:
# Tab-separated listing of each vocabulary term next to its position; these
# positions line up with the indices shown in the categoryVector column.
print('index\tvalue')
for position, term in enumerate(category_count_vectorizer.vocabulary):
  print(position, term, sep='\t')

index	value
0	nyheter
1	pluss
2	
3	trondheim
4	100sport
5	sortrondelag
6	nordtrondelag
7	okonomi
8	vintersport
9	fotball
10	meninger
11	innenriks
12	kultur
13	magasin
14	utenriks
15	sjakk
16	sprek
17	ordetfritt
18	moreromsdal
19	andreidretter
20	idrettspolitikk
21	migration catalog
22	ballsport
23	mesterskap
24	tema
25	arets tronder
26	forbruker
27	kuriosa
28	hjem
29	politikk
