<a href="https://colab.research.google.com/github/jpfcabral/datastructure/blob/main/project1/DCA0209_Project_Unit_1.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
# Extract the dataset archive stored on Google Drive into the current
# working directory. The -n flag tells unzip never to overwrite files that
# already exist, so re-running this cell is harmless.
# NOTE(review): the path assumes Drive is already mounted at /content/drive;
# no visible cell performs that mount -- confirm before a fresh run.
!unzip -n "/content/drive/MyDrive/UFRN/Engenharia de Computação/Estrutura de Dados II/archive.zip"

Archive:  /content/drive/MyDrive/UFRN/Engenharia de Computação/Estrutura de Dados II/archive.zip


In [2]:
import csv
import random
import time

### Criando classe com estruturas e algoritmos de melhor performance

In [3]:
# Path of the comments CSV handed to RedditDataset below.
# NOTE(review): presumably this file is produced by the unzip cell above -- confirm.
data_path = "/content/the-reddit-climate-change-dataset-comments.csv"

class RedditDataset:
  """In-memory view of a comments CSV with linear and dict-backed lookups.

  The whole file is read into memory. Column positions are fixed by the code:
  index 1 is the row id, index 7 the message text, index 8 the sentiment
  value, and the last column is parsed as an integer used by the sum checks.

  Attributes:
    header: first CSV row; an empty list when the file has no rows.
    rows: every remaining CSV row, each a list of strings.
    data: dict mapping the id column to its row (last row wins on duplicates).
    sentiment_dict: dict mapping the integer value of the LAST column to a
      row carrying that value (last row wins). Despite its name, the key is
      not the sentiment column at index 8.
  """

  def __init__(self, csv_path):
    """Load `csv_path` and build the lookup dictionaries.

    Args:
      csv_path: path to a CSV file whose first row is a header.

    Raises:
      OSError: if the file cannot be opened.
      ValueError: if the last column of a data row is not an integer literal.
      IndexError: if a data row has fewer than two columns.
    """
    # newline="" is required by the csv module so that line breaks embedded
    # in quoted fields are parsed correctly; the encoding is made explicit so
    # the result does not depend on the platform default.
    with open(csv_path, newline="", encoding="utf-8") as csv_file:
      rows = list(csv.reader(csv_file))

    # Tolerate a completely empty file instead of failing on rows[0].
    self.header = rows[0] if rows else []
    self.rows = rows[1:]

    self.data = {}
    self.sentiment_dict = {}
    for row in self.rows:
      self.sentiment_dict[int(row[-1])] = row
      self.data[row[1]] = row

  def find_element_fast(self, id):
    """Return the row whose id column equals `id` via dict lookup, or -1."""
    return self.data.get(id, -1)

  def find_element(self, id):
    """Return the first row whose id column equals `id` by linear scan, or -1."""
    for row in self.rows:
      if row[1] == id:
        return row
    return -1

  def find_messages_sentiment(self, inf=-1, sup=1):
    """Return the message texts whose sentiment is strictly between the bounds.

    Args:
      inf: exclusive lower bound.
      sup: exclusive upper bound.

    Returns:
      List of column-7 values, in file order. Rows with an empty sentiment
      field are treated as having sentiment 0.0.
    """
    messages = []
    for row in self.rows:
      sentiment = float(row[8]) if row[8] != '' else 0.0
      if inf < sentiment < sup:
        messages.append(row[7])
    return messages

  def check_sum(self, soma):
    """Brute-force search for two rows whose last-column values add to `soma`.

    Every ordered pair of rows is tried (a row may be paired with itself).

    Returns:
      A `(row_i, row_j)` tuple for the first matching pair, or -1 if none.
    """
    # Iterate over ALL rows: the previous `len(rows) - 1` bound silently
    # excluded the last row from both positions of the pair.
    total_rows = len(self.rows)
    for i in range(total_rows):
      first_value = int(self.rows[i][-1])
      for j in range(total_rows):
        if first_value + int(self.rows[j][-1]) == soma:
          return self.rows[i], self.rows[j]
    return -1

  def check_sum_fast(self, soma):
    """Dict-based search for two rows whose last-column values add to `soma`.

    Returns:
      A `(row_a, row_b)` tuple taken from `sentiment_dict`, or -1 if no two
      indexed values add up to `soma`.
    """
    for value, row in self.sentiment_dict.items():
      # value + complement == soma  <=>  complement == soma - value.
      # (The previous `value - soma` matched pairs by difference instead.)
      complement = soma - value
      if complement in self.sentiment_dict:
        return row, self.sentiment_dict[complement]
    return -1

# Load the CSV and build the lookup dictionaries once; the cells below reuse
# this single instance.
reddit_dataset = RedditDataset(data_path)

### Analisando busca por id

In [4]:
# Collect every id (column 1) and draw 500 of them at random, with
# replacement, as lookup targets for the benchmarks below.
ids = [row[1] for row in reddit_dataset.rows]
# random.choice picks a valid element directly. The previous
# ids[random.randint(0, len(ids))] could index one past the end because
# randint includes its upper bound.
ids_selected = [random.choice(ids) for _ in range(500)]

In [5]:
# Benchmark the linear-scan lookup: accumulate only the time spent inside
# find_element for each sampled id.
# perf_counter() is the high-resolution monotonic clock intended for
# measuring intervals; time.time() follows the wall clock and can jump.
total_time_slow = 0
for comment_id in ids_selected:  # avoid shadowing the builtin `id`
  start = time.perf_counter()
  reddit_dataset.find_element(comment_id)
  total_time_slow += time.perf_counter() - start

print(total_time_slow)

146.33182501792908


In [6]:
# Benchmark the dict-backed lookup on the same sampled ids.
# Each call is very short, so the high-resolution perf_counter() clock is
# used instead of time.time(), whose resolution and wall-clock adjustments
# can distort such small intervals.
total_time_fast = 0
for comment_id in ids_selected:  # avoid shadowing the builtin `id`
  start = time.perf_counter()
  reddit_dataset.find_element_fast(comment_id)
  total_time_fast += time.perf_counter() - start

print(total_time_fast)

0.0009725093841552734


### Encontrando elementos entre limites

In [7]:
# Count the messages whose sentiment lies strictly between -0.2 and 0.1
# (rows with an empty sentiment field are treated as 0.0 by the method).
len(reddit_dataset.find_messages_sentiment(-0.2, 0.1))

659445

### Checando soma de scores

In [8]:
# Draw 10000 random integers, each in [0, 200] inclusive, to use as the
# target sums in the benchmarks below.
scores = [random.randint(0, 200) for _draw in range(10_000)]

In [9]:
# Benchmark the brute-force pair search: accumulate only the time spent
# inside check_sum for each random target sum.
# perf_counter() is the high-resolution monotonic clock intended for
# measuring intervals; time.time() follows the wall clock and can jump.
total_time_slow = 0
for score in scores:
  start = time.perf_counter()
  reddit_dataset.check_sum(score)
  total_time_slow += time.perf_counter() - start

print(total_time_slow)

44.123188734054565


In [10]:
# Benchmark the dict-based pair search on the same target sums.
# Each call is very short, so the high-resolution perf_counter() clock is
# used instead of time.time(), whose resolution and wall-clock adjustments
# can distort such small intervals.
total_time_fast = 0
for score in scores:
  start = time.perf_counter()
  reddit_dataset.check_sum_fast(score)
  total_time_fast += time.perf_counter() - start

print(total_time_fast)

0.008278846740722656


### Testes

In [11]:
# Install the pytest test runner and the pytest-sugar plugin used to run the
# test module at the end of this notebook.
# NOTE(review): versions are not pinned, so results may vary between runs.
!pip install pytest pytest-sugar

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


In [12]:
%%file reddit_dataset_test.py
import pytest
import csv
import random
import time


# Path of the comments CSV loaded by the session-wide `data` fixture.
data_path = "/content/the-reddit-climate-change-dataset-comments.csv"

class RedditDataset:
  """In-memory view of a comments CSV with linear and dict-backed lookups.

  The whole file is read into memory. Column positions are fixed by the code:
  index 1 is the row id, index 7 the message text, index 8 the sentiment
  value, and the last column is parsed as an integer used by the sum checks.

  Attributes:
    header: first CSV row; an empty list when the file has no rows.
    rows: every remaining CSV row, each a list of strings.
    data: dict mapping the id column to its row (last row wins on duplicates).
    sentiment_dict: dict mapping the integer value of the LAST column to a
      row carrying that value (last row wins). Despite its name, the key is
      not the sentiment column at index 8.
  """

  def __init__(self, csv_path):
    """Load `csv_path` and build the lookup dictionaries.

    Args:
      csv_path: path to a CSV file whose first row is a header.

    Raises:
      OSError: if the file cannot be opened.
      ValueError: if the last column of a data row is not an integer literal.
      IndexError: if a data row has fewer than two columns.
    """
    # newline="" is required by the csv module so that line breaks embedded
    # in quoted fields are parsed correctly; the encoding is made explicit so
    # the result does not depend on the platform default.
    with open(csv_path, newline="", encoding="utf-8") as csv_file:
      rows = list(csv.reader(csv_file))

    # Tolerate a completely empty file instead of failing on rows[0].
    self.header = rows[0] if rows else []
    self.rows = rows[1:]

    self.data = {}
    self.sentiment_dict = {}
    for row in self.rows:
      self.sentiment_dict[int(row[-1])] = row
      self.data[row[1]] = row

  def find_element_fast(self, id):
    """Return the row whose id column equals `id` via dict lookup, or -1."""
    return self.data.get(id, -1)

  def find_element(self, id):
    """Return the first row whose id column equals `id` by linear scan, or -1."""
    for row in self.rows:
      if row[1] == id:
        return row
    return -1

  def find_messages_sentiment(self, inf=-1, sup=1):
    """Return the message texts whose sentiment is strictly between the bounds.

    Args:
      inf: exclusive lower bound.
      sup: exclusive upper bound.

    Returns:
      List of column-7 values, in file order. Rows with an empty sentiment
      field are treated as having sentiment 0.0.
    """
    messages = []
    for row in self.rows:
      sentiment = float(row[8]) if row[8] != '' else 0.0
      if inf < sentiment < sup:
        messages.append(row[7])
    return messages

  def check_sum(self, soma):
    """Brute-force search for two rows whose last-column values add to `soma`.

    Every ordered pair of rows is tried (a row may be paired with itself).

    Returns:
      A `(row_i, row_j)` tuple for the first matching pair, or -1 if none.
    """
    # Iterate over ALL rows: the previous `len(rows) - 1` bound silently
    # excluded the last row from both positions of the pair.
    total_rows = len(self.rows)
    for i in range(total_rows):
      first_value = int(self.rows[i][-1])
      for j in range(total_rows):
        if first_value + int(self.rows[j][-1]) == soma:
          return self.rows[i], self.rows[j]
    return -1

  def check_sum_fast(self, soma):
    """Dict-based search for two rows whose last-column values add to `soma`.

    Returns:
      A `(row_a, row_b)` tuple taken from `sentiment_dict`, or -1 if no two
      indexed values add up to `soma`.
    """
    for value, row in self.sentiment_dict.items():
      # value + complement == soma  <=>  complement == soma - value.
      # (The previous `value - soma` matched pairs by difference instead.)
      complement = soma - value
      if complement in self.sentiment_dict:
        return row, self.sentiment_dict[complement]
    return -1

@pytest.fixture(scope="session")
def data():
    """Build a single RedditDataset from `data_path`, shared by the whole test session."""
    dataset = RedditDataset(data_path)
    return dataset

def test_find_element(data):
  assert data.find_element('imlddn9') != -1

def test_find_element_error(data):
  assert data.find_element('imlddn9123456') == -1

def test_find_element_fast(data):
  assert data.find_element_fast('imlddn9') != -1

def test_find_element_fast_error(data):
  assert data.find_element_fast('imlddn9123456') == -1

def test_find_messages_sentiment(data):
  assert data.find_messages_sentiment(-0.1, 0.1) is not []

Overwriting reddit_dataset_test.py


In [13]:
# Run the test module written by the %%file cell above, with very verbose
# (-vv) per-test output.
!pytest /content/reddit_dataset_test.py -vv

[1mTest session starts (platform: linux, Python 3.7.14, pytest 3.6.4, pytest-sugar 0.9.5)[0m
cachedir: .pytest_cache
rootdir: /content, inifile:
plugins: typeguard-2.7.1, sugar-0.9.5

 [36mreddit_dataset_test.py[0m::test_find_element[0m [32m✓[0m                      [32m20% [0m[40m[32m█[0m[40m[32m█        [0m
 [36mreddit_dataset_test.py[0m::test_find_element_error[0m [32m✓[0m                [32m40% [0m[40m[32m█[0m[40m[32m█[0m[40m[32m█[0m[40m[32m█      [0m
 [36mreddit_dataset_test.py[0m::test_find_element_fast[0m [32m✓[0m                 [32m60% [0m[40m[32m█[0m[40m[32m█[0m[40m[32m█[0m[40m[32m█[0m[40m[32m█[0m[40m[32m█    [0m
 [36mreddit_dataset_test.py[0m::test_find_element_fast_error[0m [32m✓[0m           [32m80% [0m[40m[32m█[0m[40m[32m█[0m[40m[32m█[0m[40m[32m█[0m[40m[32m█[0m[40m[32m█[0m[40m[32m█[0m[40m[32m█  [0m
 [36mreddit_dataset_test.py[0m::test_find_messages_sentiment[0m [32m✓[0m         