Merge pull request #19 from jetbrains-academy/sofia/find_unique_values

sofiiako · web-flow · commit 6b0a5542f678 · 2021-09-28T01:05:06.000+03:00
Added task Find Unique Values
diff --git a/NumPy/Compare Search/Find Unique Values/__init__.py b/NumPy/Compare Search/Find Unique Values/__init__.py
diff --git a/NumPy/Compare Search/Find Unique Values/data.csv b/NumPy/Compare Search/Find Unique Values/data.csv
@@ -0,0 +1,51 @@
+id,metric1,metric2,metric3,metric4,metric5,metric6,metric7,metric8
+3,67,63,89,54,39,9,90,56
+3,49,1,82,35,9,41,53,24
+5,85,47,44,39,92,20,95,78
+5,96,74,37,93,91,21,76,9
+1,15,1,40,28,58,27,20,58
+5,73,57,7,50,65,81,1,12
+1,86,5,17,26,16,24,79,62
+1,32,73,33,2,32,91,22,16
+2,7,45,46,69,73,96,98,35
+2,77,80,37,41,74,3,58,94
+2,15,61,55,48,60,16,86,76
+4,37,39,81,90,31,15,64,90
+5,40,37,23,88,3,82,59,60
+2,72,13,49,12,27,87,78,66
+1,24,57,26,33,15,66,49,68
+4,90,78,89,93,31,14,21,69
+1,72,95,32,93,53,25,10,92
+4,51,84,29,15,53,29,4,53
+5,86,50,54,9,10,31,36,97
+4,80,29,93,62,26,32,50,39
+4,73,92,75,87,23,38,32,43
+2,93,47,61,81,10,20,22,9
+5,40,19,96,53,21,89,30,90
+3,92,80,90,12,78,84,52,43
+3,14,82,17,98,86,75,94,44
+1,16,100,60,24,63,13,67,34
+4,86,76,73,92,59,73,26,28
+1,73,62,87,26,21,49,33,47
+5,66,47,56,87,62,10,38,41
+2,35,23,78,91,10,12,42,21
+3,99,22,55,99,38,53,37,57
+1,86,71,37,98,15,12,43,63
+1,8,76,22,70,41,50,25,49
+1,39,90,25,100,33,88,98,80
+5,55,70,64,51,49,10,44,73
+3,46,63,75,52,75,78,82,64
+4,85,5,14,45,9,77,14,86
+2,47,42,86,93,9,7,86,92
+3,87,72,78,72,81,75,96,85
+1,15,50,70,13,36,10,82,95
+1,85,74,88,71,30,14,21,53
+4,44,59,69,84,49,56,49,63
+1,30,13,4,3,9,69,58,67
+1,60,63,29,19,97,35,100,86
+5,95,20,7,23,78,97,61,6
+3,48,21,30,78,19,59,58,18
+2,22,14,95,50,81,90,98,64
+1,28,44,16,19,59,8,12,13
+4,36,43,30,56,11,23,13,12
+2,27,16,26,80,94,79,51,28
diff --git a/NumPy/Compare Search/Find Unique Values/task-info.yaml b/NumPy/Compare Search/Find Unique Values/task-info.yaml
@@ -0,0 +1,31 @@
+type: edu
+files:
+- name: task.py
+  visible: true
+  placeholders:
+  - offset: 26
+    length: 55
+    placeholder_text: '# TODO'
+  - offset: 97
+    length: 47
+    placeholder_text: '# TODO'
+  - offset: 156
+    length: 17
+    placeholder_text: '# TODO'
+  - offset: 216
+    length: 35
+    placeholder_text: '# TODO'
+  - offset: 275
+    length: 29
+    placeholder_text: '# TODO'
+  - offset: 333
+    length: 50
+    placeholder_text: '# TODO'
+- name: tests/test_task.py
+  visible: false
+- name: __init__.py
+  visible: false
+- name: tests/__init__.py
+  visible: false
+- name: data.csv
+  visible: true
diff --git a/NumPy/Compare Search/Find Unique Values/task.md b/NumPy/Compare Search/Find Unique Values/task.md
@@ -0,0 +1,61 @@
+## Find Unique Values
+
+The [`numpy.unique`](https://numpy.org/doc/stable/reference/generated/numpy.unique.html) function is pretty
+straightforward: it finds the unique elements of the input array and returns them as a sorted array:
+
+```python
+print(np.unique([1, 1, 2, 2, 3, 3]))
+```
+Output:
+```text
+[1 2 3]
+```
+Additionally, `numpy.unique` can:
+
+- identify unique rows or columns of an array (when the `axis` parameter is given; otherwise, the search is performed on the **flattened** input array):
+
+```python
+a = np.array([[1, 2, 6], [4, 2, 3], [4, 2, 3]])
+print(np.unique(a))
+print(np.unique(a, axis=0))
+```
+Output:
+```text
+[1 2 3 4 6]
+[[1 2 6]
+ [4 2 3]]
+```
+
+- return the unique values and the number of occurrences of each unique value (`return_counts=True`):
+```python
+a = np.array([1, 2, 6, 4, 2, 3, 2])
+print(np.unique(a, return_counts=True))
+```
+Output:
+```text
+(array([1, 2, 3, 4, 6]), array([1, 3, 1, 1, 1]))
+```
+
+- return the indices of the first occurrences of the unique values (`return_index=True`):
+
+```python
+a = np.array([1, 2, 6, 4, 2, 3, 2])
+unique, index = np.unique(a, return_index=True)
+print(unique, index)
+```
+Output:
+```text
+[1 2 3 4 6] [0 1 5 3 2]
+```
+### Task
+You are given a dataset in the file `data.csv`. The first column contains ids (class labels),
+all other columns contain the values of some metrics collected for each entry.
+1. [Load the dataset](course://NumPy/Array Basics/Reading and Writing Files) from the file into `csv`. Mind the header!
+2. [Split](course://NumPy/Array Indexing and Slicing/Indexing Basics) the dataset into `data` (a 2-D array) and `labels` (a 1-D array of **integers**).
+3. Determine the set of classes represented in the dataset (it should be assigned to
+the variable `classes`).
+4. Find the unique values and their counts in the measurements (`data`) and assign them to `unique_measurements` and `unique_data_counts`.
+5. Find the index of the most frequent measurement value in the array of unique values (`most_frequent_index`) and get the measurement itself
+(`most_frequent_measurement`) using that index.
+
+<div class="hint">For the last one you could use <code>numpy.argmax</code>.</div>
diff --git a/NumPy/Compare Search/Find Unique Values/task.py b/NumPy/Compare Search/Find Unique Values/task.py
@@ -0,0 +1,17 @@
+import numpy as np
+
+csv = np.genfromtxt('data.csv', delimiter=',', skip_header=1)
+data, labels = csv[:, 1:], np.array(csv[:, 0], dtype=np.int64)
+
+classes = np.unique(labels)
+unique_measurements, unique_data_counts = np.unique(data, return_counts=True)
+
+most_frequent_index = np.argmax(unique_data_counts)
+most_frequent_measurement = unique_measurements.flatten()[most_frequent_index]
+
+if __name__ == "__main__":  # demo output only; skipped when tests/test_task.py imports this module
+    print(classes)  # sorted unique class labels taken from the first CSV column
+    print(unique_data_counts)  # number of occurrences of each unique measurement value
+    print(most_frequent_index)  # position of the largest count within unique_data_counts
+    print(most_frequent_measurement)  # the unique measurement value found at that position
+
diff --git a/NumPy/Compare Search/Find Unique Values/tests/__init__.py b/NumPy/Compare Search/Find Unique Values/tests/__init__.py
diff --git a/NumPy/Compare Search/Find Unique Values/tests/test_task.py b/NumPy/Compare Search/Find Unique Values/tests/test_task.py
@@ -0,0 +1,32 @@
+import unittest
+import numpy as np
+
+from task import *  # brings the learner's module-level variables (csv, data, labels, classes, ...) into scope
+
+test_csv = np.genfromtxt('data.csv', delimiter=',', skip_header=1)  # reference values, recomputed independently of task.py
+test_data, test_labels = test_csv[:, 1:], np.array(test_csv[:, 0], dtype=np.int64)
+test_classes = np.unique(test_labels)
+test_unique_measurements, test_unique_data_counts = np.unique(test_data, return_counts=True)
+test_most_frequent_index = np.argmax(test_unique_data_counts)
+test_most_frequent_measurement = test_unique_measurements.flatten()[test_most_frequent_index]
+
+
+class TestCase(unittest.TestCase):  # compares the learner's variables with the reference values above
+    def test_data(self):  # checks the loaded array and its split into data and labels
+        np.testing.assert_array_equal(csv, test_csv, err_msg='Dataset is imported improperly.')
+        np.testing.assert_array_equal(data, test_data, err_msg='Array of measurements is off.')
+        np.testing.assert_array_equal(labels, test_labels, err_msg='Labels array is off.')
+
+    def test_unique(self):  # checks classes, unique measurement values and their counts
+        np.testing.assert_array_equal(classes, test_classes,
+                                      err_msg='The set of classes is wrong.')
+        np.testing.assert_array_equal(unique_measurements, test_unique_measurements,
+                                      err_msg='The set of unique measurements is wrong.')
+        np.testing.assert_array_equal(unique_data_counts, test_unique_data_counts,
+                                      err_msg='The set containing the number of occurrences of the unique values is wrong.')
+
+    def test_most_frequent(self):  # checks the argmax index and the value it selects
+        self.assertEqual(most_frequent_index, test_most_frequent_index,
+                         msg="The index of the most frequent value is incorrect.")
+        self.assertEqual(most_frequent_measurement, test_most_frequent_measurement,
+                        msg="The most frequent value is identified incorrectly.")
diff --git a/NumPy/Compare Search/lesson-info.yaml b/NumPy/Compare Search/lesson-info.yaml
@@ -4,3 +4,4 @@ content:
 - Element-wise Comparison
 - Find maximum
 - Search
+- Find Unique Values